[med-svn] [Git][med-team/sniffles][master] 4 commits: New upstream version 1.0.12+ds
Steffen Möller
gitlab at salsa.debian.org
Wed Sep 2 21:05:59 BST 2020
Steffen Möller pushed to branch master at Debian Med / sniffles
Commits:
e784116a by Steffen Moeller at 2020-09-02T20:48:05+02:00
New upstream version 1.0.12+ds
- - - - -
840e75e4 by Steffen Moeller at 2020-09-02T20:48:05+02:00
routine-update: New upstream version
- - - - -
e392e3e9 by Steffen Moeller at 2020-09-02T20:48:06+02:00
Update upstream source from tag 'upstream/1.0.12+ds'
Update to upstream version '1.0.12+ds'
with Debian dir 5ddeee051d0580bae1a0f687ae6ac4f2b6423489
- - - - -
2e3595cf by Steffen Moeller at 2020-09-02T22:04:37+02:00
Builds but prefer to delay upload because of presumed bug
Waiting for reaction by upstream to
https://github.com/fritzsedlazeck/Sniffles/pull/225
- - - - -
23 changed files:
- README.md
- debian/changelog
- src/Alignment.cpp
- src/BamParser.cpp
- src/Genotyper/Genotyper.cpp
- src/Genotyper/Genotyper.h
- src/Paramer.h
- src/Sniffles.cpp
- src/cluster/Cluster_SVs.cpp
- src/force_calling/Force_calling.cpp
- src/force_calling/VCF_parser.cpp
- src/print/BedpePrinter.cpp
- src/print/IPrinter.cpp
- src/print/IPrinter.h
- src/print/VCFPrinter.cpp
- src/print/VCFPrinter.h
- src/sub/Breakpoint.cpp
- src/sub/Breakpoint.h
- src/sub/Detect_Breakpoints.cpp
- src/sub/Detect_Breakpoints.h
- src/tree/Breakpoint_Tree.cpp
- src/tree/IntervallTree.cpp
- src/tree/TNode.h
Changes:
=====================================
README.md
=====================================
@@ -1,5 +1,5 @@
# Sniffles
-Sniffles is a structural variation caller using third generation sequencing (PacBio or Oxford Nanopore). It detects all types of SVs (10bp+) using evidence from split-read alignments, high-mismatch regions, and coverage analysis. Please note the current version of Sniffles requires sorted output from BWA-MEM (use -M and -x parameter) or NGMLR with the optional SAM attributes enabled! If you experience problems or have suggestions please contact: fritz.sedlazeck at gmail.com
+Sniffles is a structural variation caller using third generation sequencing (PacBio or Oxford Nanopore). It detects all types of SVs (10bp+) using evidence from split-read alignments, high-mismatch regions, and coverage analysis. Please note the current version of Sniffles requires sorted output from BWA-MEM (use -M and -x parameter), Minimap2 (sam file with Cigar & MD string) or NGMLR. If you experience problems or have suggestions please contact: fritz.sedlazeck at gmail.com
Please see our github wiki for more information (https://github.com/fritzsedlazeck/Sniffles/wiki)
=====================================
debian/changelog
=====================================
@@ -1,3 +1,11 @@
+sniffles (1.0.12+ds-1) UNRELEASED; urgency=medium
+
+ * Team upload.
+ * New upstream version
+ * TODO/BLOCKER: Wait for reaction of upstream to https://github.com/fritzsedlazeck/Sniffles/pull/225
+
+ -- Steffen Moeller <moeller at debian.org> Wed, 02 Sep 2020 20:48:13 +0200
+
sniffles (1.0.11+ds-2) unstable; urgency=medium
[ Afif Elghraoui ]
=====================================
src/Alignment.cpp
=====================================
@@ -139,8 +139,8 @@ vector<differences_str> Alignment::summarizeAlignment(std::vector<indel_str> &de
}
int sum_mis = 0;
- int sum_events=0;
- int sum_single=0;
+ int sum_events = 0;
+ int sum_single = 0;
for (size_t i = 0; i < al->CigarData.size(); i++) {
if (al->CigarData[i].Type == 'M' || (al->CigarData[i].Type == '=' || al->CigarData[i].Type == 'X')) {
pos += al->CigarData[i].Length;
@@ -150,9 +150,9 @@ vector<differences_str> Alignment::summarizeAlignment(std::vector<indel_str> &de
ev.type = al->CigarData[i].Length; //deletion
ev.readposition = read_pos;
ev.resolved = true;
- if (al->CigarData[i].Length >2) {
+ if (al->CigarData[i].Length > 2) {
sum_events++;
- }else{
+ } else {
sum_single++;
}
events.push_back(ev);
@@ -162,9 +162,9 @@ vector<differences_str> Alignment::summarizeAlignment(std::vector<indel_str> &de
ev.resolved = true;
ev.readposition = read_pos;
ev.type = al->CigarData[i].Length * -1; //insertion
- if (al->CigarData[i].Length >2) {
+ if (al->CigarData[i].Length > 2) {
sum_events++;
- }else{
+ } else {
sum_single++;
}
@@ -174,7 +174,7 @@ vector<differences_str> Alignment::summarizeAlignment(std::vector<indel_str> &de
pos += al->CigarData[i].Length;
ev.resolved = true;
read_pos += al->CigarData[i].Length;
- } else if ((al->CigarData[i].Type == 'S' || al->CigarData[i].Type == 'H' )&& al->CigarData[i].Length > Parameter::Instance()->huge_ins) { /// Used for reads ranging into an inser
+ } else if ((al->CigarData[i].Type == 'S' || al->CigarData[i].Type == 'H') && al->CigarData[i].Length > Parameter::Instance()->huge_ins) { /// Used for reads ranging into an inser
string sa;
al->GetTag("SA", sa);
uint32_t sv;
@@ -188,19 +188,32 @@ vector<differences_str> Alignment::summarizeAlignment(std::vector<indel_str> &de
ev.resolved = false;
ev.type = Parameter::Instance()->huge_ins * -1; //insertion: WE have to fix the length since we cannot estimate it!]
events.push_back(ev);
+ } else if (!al->GetTag("SV", sv) && sa.empty()) {
+ if (flag) {
+ cout << "HIT ALN" << endl;
+ }
+ ev.position = pos; // - Parameter::Instance()->huge_ins;
+ if (i == 0) {
+ ev.readposition = 0;
+ } else {
+ ev.readposition = read_pos;
+ }
+ ev.resolved = false;
+ ev.type = Parameter::Instance()->huge_ins * -1; //insertion: WE have to fix the length since we cannot estimate it!]
+ events.push_back(ev);
}
}
}
//exit(0);
if (flag) {
- std::cout << "FIRST:" << std::endl;
- for (size_t i = 0; i < events.size(); i++) {
- // if (abs(events[i].type) > 200) {
- cout << events[i].position << " " << events[i].type << endl;
- // }
- }
- cout << endl;
+ /* std::cout << "FIRST:" << std::endl;
+ for (size_t i = 0; i < events.size(); i++) {
+ // if (abs(events[i].type) > 200) {
+ cout << events[i].position << " " << events[i].type << endl;
+ // }
+ }
+ cout << endl;*/
}
//set ref length requ. later on:
@@ -380,8 +393,12 @@ size_t Alignment::get_length(std::vector<CigarOp> CigarData) {
return len;
}
size_t Alignment::getRefLength() {
- return this->ref_len;
-// return get_length(this->al->CigarData);
+ if (this->ref_len < 0) {
+ return this->ref_len;
+ }
+// return get_length(this->getCigar());
+
+ return get_length(this->al->CigarData);
}
size_t Alignment::getOrigLen() {
return orig_length;
@@ -715,7 +732,7 @@ vector<aln_str> Alignment::getSA(RefVector ref) {
uint32_t sv;
al->GetTag("SV", sv);
tmp.cross_N = ((sv & Ns_CLIPPED));
- bool flag = strcmp(getName().c_str(), "0bac61ef-7819-462b-ae3d-32c68fe580c0") == 0; //Parameter::Instance()->read_name.c_str()) == 0;
+ bool flag = false; // strcmp(getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0;
get_coords(tmp, tmp.read_pos_start, tmp.read_pos_stop);
if (flag) {
@@ -949,7 +966,7 @@ std::string Alignment::get_md() {
} else {
std::cerr << "No MD string detected! Check bam file! Otherwise generate using e.g. samtools." << std::endl;
cout << "MD: TEST" << this->getName() << endl;
- exit(0);
+ exit(EXIT_FAILURE);
}
return md;
}
@@ -960,7 +977,7 @@ std::string Alignment::get_cs() {
return cs;
} else {
std::cerr << "No CS string detected! Check bam file!" << std::endl;
- exit(0);
+ exit(EXIT_FAILURE);
}
return cs;
}
@@ -1144,6 +1161,9 @@ vector<str_event> Alignment::get_events_Aln() {
// event_aln = summarize_csstring(dels);
// } else {
event_aln = summarizeAlignment(dels);
+ if (flag) {
+ cout << "\tALN events " << event_aln.size() << endl;
+ }
// }
//double time2 = Parameter::Instance()->meassure_time(comp_aln, "\tcompAln Events: ");
@@ -1277,14 +1297,16 @@ vector<str_event> Alignment::get_events_Aln() {
tmp.type = 0;
if (insert_max > Parameter::Instance()->min_length && insert > (del + del)) { //we have an insertion! //todo check || vs. &&
- if (is_N_region && insert_max * Parameter::Instance()->avg_ins < Parameter::Instance()->min_length) {
+ if (is_N_region && insert_max - (insert_max * Parameter::Instance()->avg_ins) < Parameter::Instance()->min_length) {
tmp.type = 0;
} else {
+
+ if (flag) {
+ cout << "Is INS" << endl;
+ }
+
tmp.length = insert_max; //TODO not sure!
while (start < stop && event_aln[start].readposition == -1) {
- if (flag) {
- cout << event_aln[start].readposition << " " << event_aln[start].type << endl;
- }
start++;
}
if (flag) {
@@ -1292,9 +1314,13 @@ vector<str_event> Alignment::get_events_Aln() {
}
tmp.read_pos = event_aln[start].readposition;
if (Parameter::Instance()->print_seq) {
- if (flag) {
- std::cout << "Seq+:" << this->getAlignment()->QueryBases.substr(tmp.read_pos, tmp.length) << std::endl;
+ // if (flag) {
+ // std::cout << "Seq+:" << this->getAlignment()->QueryBases.substr(tmp.read_pos, tmp.length) << std::endl;
+ // }
+ if (this->getAlignment()->QueryBases.size() < tmp.read_pos) {
+ cerr << "Read sequence is shorter than expected. Please check your bam file if the read sequence is reported!" << endl;
+ exit(-1);
}
tmp.sequence = this->getAlignment()->QueryBases.substr(tmp.read_pos, tmp.length);
} else {
@@ -1305,7 +1331,7 @@ vector<str_event> Alignment::get_events_Aln() {
tmp.is_noise = false;
}
} else if (del_max > Parameter::Instance()->min_length && (insert + insert) < del) { //deletion
- if (is_N_region && del_max * Parameter::Instance()->avg_del < Parameter::Instance()->min_length) {
+ if (is_N_region && del_max - (del_max * Parameter::Instance()->avg_del) < Parameter::Instance()->min_length) {
tmp.type = 0;
} else {
if (Parameter::Instance()->print_seq) {
@@ -1340,18 +1366,33 @@ vector<str_event> Alignment::get_events_Aln() {
cout << event_aln[start].position << " " << event_aln[stop].position << endl;
cout << "store: " << tmp.pos << " " << tmp.pos + abs(tmp.length) << " " << tmp.length << endl;
cout << tmp.sequence << endl;
+ cout << tmp.type << endl;
cout << endl;
}
if (tmp.type != 0) {
+ if (flag) {
+ cout << "ADDED" << endl;
+ }
events.push_back(tmp);
+ } else {
+ if (flag) {
+ cout << "NOT ADDED" << endl;
+ }
}
}
}
// Parameter::Instance()->meassure_time(comp_aln, "\tcompPosition: ");
if (noise_events > 4) {
+ if (flag) {
+ cout << "!dumped" << endl;
+ }
events.clear();
}
+
+ if (flag) {
+ cout << "events" << events.size() << endl;
+ }
return events;
}
=====================================
src/BamParser.cpp
=====================================
@@ -13,7 +13,7 @@ BamParser::BamParser(string file){
if(!reader.Open(tmps)){
cerr<<"BAM Parser: could not open file: "<<file<<endl;
- exit(0);
+ exit(EXIT_FAILURE);
}
}
@@ -23,7 +23,7 @@ Alignment* BamParser::parseRead(uint16_t mappingQv){
Alignment *align = new Alignment();
BamAlignment* al = new BamAlignment();
while(reader.GetNextAlignmentCore(al[0])){
- if( al->IsMapped() && al->MapQuality > mappingQv){
+ if( al->IsMapped() && al->MapQuality >= mappingQv){ // >??
al->BuildCharData();
align->setAlignment(al);
return align;
@@ -42,7 +42,7 @@ void BamParser::parseReadFast(uint16_t mappingQv,Alignment*& align){
align->clear_QueryBases();
while(reader.GetNextAlignmentCore(al[0])){
- if( al->IsMapped() && al->MapQuality > mappingQv){
+ if( al->IsMapped() && al->MapQuality >= mappingQv){
al->BuildCharData();
align->setAlignment(al);
return;
=====================================
src/Genotyper/Genotyper.cpp
=====================================
@@ -7,84 +7,548 @@
#include "Genotyper.h"
-std::string Genotyper::assess_genotype(int ref, int support) {
- double allele = (double) support / (double) (support + ref);
+long get_ref_lengths3(int id, RefVector ref) {
+ long length = 0;
- if (allele < Parameter::Instance()->min_allelel_frequency) {
- return "";
+ for (size_t i = 0; i < (size_t) id && i < ref.size(); i++) {
+ length += (long) ref[i].RefLength + (long) Parameter::Instance()->max_dist;
+ }
+ return length;
+}
+
+void update_entries(std::vector<str_breakpoint_slim> &entries, int start, int stop, size_t & current_pos, int wobble, std::string rname, bool strand) { //TODO room for optimization!
+
+ if (entries.empty() || stop + wobble < entries[0].pos) {
+ return;
+ }
+// cout<<"PS: "<<current_pos<<endl;
+// cout << "cov: " << start << " " << stop << endl;
+ for (size_t i = current_pos; i < entries.size(); i++) {
+ if (entries[i].pos < start - wobble) {
+ current_pos = i;
+ }
+ if ((start - wobble < entries[i].pos && stop + wobble > entries[i].pos) && (abs(entries[i].pos - start) > wobble && abs(entries[i].pos - stop) > wobble)) { //TODO not sure if I cannot combine these two.
+ //entries[i].cov++;
+ entries[i].rnames[rname] = strand; //TOOD maybe just a normal vector!
+// cout << "\tHIT: " << entries[i].rnames.size() << endl;
+ }
+ if (entries[i].pos > stop + wobble) {
+ break;
+ }
+ }
+}
+
+void update_coverage(std::map<std::string, std::vector<str_breakpoint_slim> > & entries) {
+
+ //run across BAM file to parse intervals and compute coverage
+ //tree.overlaps <- strand 1 = true , else = false??
+ //do I need to think about filtering etc?
+
+ cout << "\tReopening Bam file for parsing coverage " << endl;
+ RefVector ref;
+ BamParser * mapped_file = 0;
+ if (Parameter::Instance()->bam_files[0].find("bam") != string::npos) {
+ mapped_file = new BamParser(Parameter::Instance()->bam_files[0]);
+ ref = mapped_file->get_refInfo();
+ } else {
+ cerr << "File Format not recognized. File must be a sorted .bam file!" << endl;
+ exit(EXIT_FAILURE);
+ }
+
+ Alignment * tmp_aln = mapped_file->parseRead(Parameter::Instance()->min_mq);
+ long num_reads = 0;
+
+ int current_RefID = tmp_aln->getRefID();
+ std::vector<str_breakpoint_slim> tmp = entries[ref[tmp_aln->getRefID()].RefName];
+
+ size_t current_pos = 0; // index of entries vector;
+
+ while (!tmp_aln->getQueryBases().empty()) {
+
+ if (current_RefID != tmp_aln->getRefID()) {
+ current_pos = 0;
+ entries[ref[current_RefID].RefName] = tmp;
+ std::cout << "\t\tSwitch Chr " << ref[tmp_aln->getRefID()].RefName << std::endl;
+ current_RefID = tmp_aln->getRefID();
+ tmp = entries[ref[current_RefID].RefName];
+ }
+
+ int start = (int) tmp_aln->getPosition();
+ int stop = (int) start + tmp_aln->getRefLength();
+ // cout << "Ref: " << ref[current_RefID].RefName << endl;
+ update_entries(tmp, start, stop, current_pos, 5, tmp_aln->getName(), tmp_aln->getStrand());
+
+ mapped_file->parseReadFast(Parameter::Instance()->min_mq, tmp_aln);
+ }
+
+ entries[ref[current_RefID].RefName] = tmp;
+ /* cout << "Check:" << endl;
+ for (std::map<std::string, std::vector<str_breakpoint_slim> >::iterator i = entries.begin(); i != entries.end(); i++) {
+ for (size_t j = 0; j < (*i).second.size(); j++) {
+ cout << (*i).second[j].chr << " " << (*i).second[j].pos<<" "<< (*i).second[j].cov<< endl;
+ }
+ }*/
+
+ std::cout << "\tFinalizing .." << std::endl;
+}
+
+str_breakpoint_slim init_breakpoint() {
+ str_breakpoint_slim tmp;
+ tmp.chr = "";
+// tmp.cov = 0;
+// tmp.is_start = false;
+ tmp.pos = 0;
+ return tmp;
+}
+
+void Genotyper::get_breakpoint_vcf(string buffer, str_breakpoint_slim & start, str_breakpoint_slim & stop) {
+ size_t i = 0;
+ int count = 0;
+
+ start = init_breakpoint();
+ stop = init_breakpoint();
+
+ while (buffer[i] != '\0' && buffer[i] != '\n') {
+ if (count == 0 && buffer[i] != '\t') {
+ start.chr += buffer[i];
+ }
+ if (count == 1 && buffer[i - 1] == '\t') {
+ start.pos = atoi(&buffer[i]);
+
+ }
+ if (stop.pos == -1 && (count == 4 && (buffer[i - 1] == '[' || buffer[i - 1] == ']'))) { //FOR TRA
+ parse_pos(&buffer[i - 1], stop.pos, stop.chr);
+ }
+
+ if (count > 6 && strncmp(";CHR2=", &buffer[i], 6) == 0) {
+ i += 6;
+ while (buffer[i] != ';') {
+ stop.chr += buffer[i];
+ i++;
+ }
+ }
+ if (count > 6 && strncmp(";END=", &buffer[i], 5) == 0) {
+ stop.pos = atoi(&buffer[i + 5]); //stores right most breakpoint
+ break;
+ }
+
+ if (buffer[i] == '\t') {
+ count++;
+ }
+ i++;
+ }
+}
+
+variant_str Genotyper::get_breakpoint_bedpe(string buffer, str_breakpoint_slim & start, str_breakpoint_slim & stop) {
+ size_t i = 0;
+ int count = 0;
+ std::string chr;
+ variant_str tmp;
+ start = init_breakpoint();
+ stop = init_breakpoint();
+
+ while (buffer[i] != '\0' && buffer[i] != '\n') {
+ if (count == 12 && buffer[i] != '\t') {
+ start.chr += buffer[i];
+ }
+
+ if (count == 13 && buffer[i - 1] == '\t') {
+ start.pos = atoi(&buffer[i]);
+ }
+ if (count == 14 && buffer[i] != '\t') {
+ stop.chr += buffer[i];
+ }
+ if (count == 15 && buffer[i - 1] == '\t') {
+ stop.pos = atoi(&buffer[i]);
+ break;
+ }
+ if (buffer[i] == '\t') {
+ count++;
+ }
+ i++;
+ }
+ return tmp;
+}
+
+void insert_sorted_entry(std::map<std::string, std::vector<str_breakpoint_slim> > & entries, str_breakpoint_slim pos) {
+ size_t i = 0;
+ while (i < entries[pos.chr].size() && entries[pos.chr][i].pos < pos.pos) {
+ i++;
+ }
+ if (entries.size() == i) {
+ entries[pos.chr].push_back(pos);
+ } else {
+ std::vector<str_breakpoint_slim>::iterator it = entries[pos.chr].begin();
+ entries[pos.chr].insert(it + i, pos);
+ }
+}
+void Genotyper::read_SVs(std::map<std::string, std::vector<str_breakpoint_slim> > & entries) {
+
+ //cout<<"READ SV "<<endl;
+ std::ifstream myfile;
+ bool is_vcf = !Parameter::Instance()->output_vcf.empty();
+
+ if (!Parameter::Instance()->output_vcf.empty()) {
+ myfile.open(Parameter::Instance()->output_vcf.c_str(), std::ifstream::in);
+ } else if (!Parameter::Instance()->output_bedpe.empty()) {
+ myfile.open(Parameter::Instance()->output_bedpe.c_str(), std::ifstream::in);
+ }
+
+ if (!myfile.good()) {
+ std::cout << "SVParse: could not open file: " << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ string buffer;
+
+ getline(myfile, buffer);
+
+ int num_sv = 0;
+ while (!myfile.eof()) {
+ if (buffer[0] != '#') {
+
+ str_breakpoint_slim start;
+ str_breakpoint_slim stop;
+ if (is_vcf) {
+ get_breakpoint_vcf(buffer, start, stop);
+ } else {
+ //get_breakpoint_bedpe(buffer); //TODO
+ }
+
+ // if (start.pos == 10441961) {
+ // cout << "Found: " << start.chr << " " << start.pos << " " << stop.chr << " " << stop.pos << endl;
+ // }
+
+ //we want a sorted inclusion!
+ insert_sorted_entry(entries, start);
+ insert_sorted_entry(entries, stop);
+
+ num_sv++;
+ if (num_sv % 5000 == 0) {
+ cout << "\t\tRead in SV: " << num_sv << endl;
+ }
+ }
+ getline(myfile, buffer);
+ }
+ myfile.close();
+
+ /*cout << "Check:" << endl;
+ for (std::map<std::string, std::vector<str_breakpoint_slim> >::iterator i = entries.begin(); i != entries.end(); i++) {
+ for (size_t j = 0; j < (*i).second.size(); j++) {
+ //if ((*i).second[j].pos == 10441961) {
+ cout << (*i).second[j].chr << " " << (*i).second[j].pos << endl;
+ //}
+ }
+ }*/
+}
+
+void Genotyper::update_svs_output(std::map<std::string, std::vector<str_breakpoint_slim> > entries) {
+ std::ifstream myfile;
+ bool is_vcf = !Parameter::Instance()->output_vcf.empty();
+
+ string file_name;
+ if (!Parameter::Instance()->output_vcf.empty()) {
+ file_name = Parameter::Instance()->output_vcf;
+ myfile.open(Parameter::Instance()->output_vcf.c_str(), std::ifstream::in);
+ } else if (!Parameter::Instance()->output_bedpe.empty()) {
+ file_name = Parameter::Instance()->output_bedpe;
+ myfile.open(Parameter::Instance()->output_bedpe.c_str(), std::ifstream::in);
+ }
+
+ FILE*file = fopen(Parameter::Instance()->tmp_file.c_str(), "w"); //
+
+ if (!myfile.good()) {
+ std::cout << "SVParse: could not open file: " << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ string buffer;
+ getline(myfile, buffer);
+//parse SVs breakpoints in file
+
+ while (!myfile.eof()) {
+ if (buffer[0] != '#') {
+
+ std::string to_print;
+ // create binary tree to hold breakpoints!
+ variant_str tmp;
+ str_breakpoint_slim start;
+ str_breakpoint_slim stop;
+ if (is_vcf) {
+ get_breakpoint_vcf(buffer, start, stop);
+ } else {
+ //get_breakpoint_bedpe(buffer); //TODO
+ }
+ map<std::string, bool> tmp_names;
+ pair<int, int> strands;
+ strands.first = 0; //+
+ strands.second = 0; //-
+ for (size_t i = 0; i < entries[start.chr].size(); i++) {
+ if (start.pos == entries[start.chr][i].pos) {
+ // final_ref.first = entries[start.chr][i].cov;
+ for (map<std::string, bool>::iterator t = entries[start.chr][i].rnames.begin(); t != entries[start.chr][i].rnames.end(); t++) {
+ if ((*t).second) {
+ strands.first++;
+ } else {
+ strands.second++;
+ }
+ tmp_names[(*t).first] = (*t).second;
+ }
+ break;
+ }
+ }
+
+ for (size_t i = 0; i < entries[stop.chr].size(); i++) {
+ if (stop.pos == entries[stop.chr][i].pos) {
+ //final_ref.second = entries[stop.chr][i].cov;
+ for (map<std::string, bool>::iterator t = entries[stop.chr][i].rnames.begin(); t != entries[stop.chr][i].rnames.end(); t++) {
+ tmp_names[(*t).first] = (*t).second;
+ if ((*t).second) {
+ strands.first++;
+ } else {
+ strands.second++;
+ }
+ }
+
+ break;
+ }
+ }
+
+ if (tmp_names.empty() == -1) {
+ std::cerr << "Error in GT: Tree node not found. Exiting." << std::endl;
+ exit(EXIT_FAILURE);
+ }
+ if (is_vcf) {
+ to_print = mod_breakpoint_vcf(buffer, strands.first, strands.second); //(int) tmp_names.size());
+ } else {
+ to_print = mod_breakpoint_bedpe(buffer, strands.first, strands.second); //(int) tmp_names.size());
+ }
+ if (!to_print.empty()) {
+ fprintf(file, "%s", to_print.c_str());
+ fprintf(file, "%c", '\n');
+ }
+ } else {
+ fprintf(file, "%s", buffer.c_str());
+ fprintf(file, "%c", '\n');
+ }
+
+ getline(myfile, buffer);
}
- if((support + ref)==0){
- allele=0;
+ myfile.close();
+ fclose(file);
+
+ string move = "mv ";
+ move += Parameter::Instance()->tmp_file;
+ move += " ";
+ move += file_name;
+ system(move.c_str());
+}
+
+void Genotyper::update_SVs2() {
+//parse SVs not just breakpoints and update with the coverage info
+ //per CHR:
+ // Reconstruct the tree: similar to the IVCF parser?
+ // Compute the coverage per chr (<can be in parallele to 1)
+ // Intersect result from 1 + 2 -> print out.
+ // Done;
+ RefVector ref;
+ BamParser * mapped_file = 0;
+ if (Parameter::Instance()->bam_files[0].find("bam") != string::npos) {
+ mapped_file = new BamParser(Parameter::Instance()->bam_files[0]);
+ ref = mapped_file->get_refInfo();
+ } else {
+ cerr << "File Format not recognized. File must be a sorted .bam file!" << endl;
+ exit(EXIT_FAILURE);
}
+ std::map<std::string, std::vector<str_breakpoint_slim> > entries;
+
+ //initialize for chrs
+ std::vector<str_breakpoint_slim> tmp;
+ for (size_t i = 0; i < ref.size(); i++) {
+ entries[ref[i].RefName] = tmp;
+ }
+
+ //parse + include sorted
+ read_SVs(entries);
+
+ //obtain coverage
+ update_coverage(entries);
+
+ //Update VCF file for these entries and put it in a tmp file.
+ update_svs_output(entries);
+
+ string del = "rm ";
+ del += Parameter::Instance()->tmp_genotyp;
+ system(del.c_str());
+}
+
+//================================================================================================
+//================================================================================================
+//================================================================================================
+//================================================================================================
+
+std::string Genotyper::assess_genotype(int ref, int support) {
+ double allele = 0;
+ if (!Parameter::Instance()->testing) {
+ ref += support; // just for now!
+ }
+ if ((ref) > 0) {
+ allele = (double) support / (double) ref; //(support + ref);
+ }
+ if (allele < Parameter::Instance()->min_allelel_frequency) {
+ return "";
+ }
std::stringstream ss;
ss << ";AF=";
ss << allele;
ss << "\tGT:DR:DV\t";
- if(ref==0 && support==0){
+ if (ref < 2) {
ss << "./."; //we cannot define it.
- }else if (allele > Parameter::Instance()->homfreq) {
+ } else if (allele > Parameter::Instance()->homfreq) {
ss << "1/1";
} else if (allele > Parameter::Instance()->hetfreq) {
ss << "0/1";
} else {
ss << "0/0";
}
- ss<<":";
- ss << ref;
+ ss << ":";
+ if (ref < support) {
+ cerr << "Warning @Genotyper:refrence coverage. Please report this! " << endl;
+ ss << ref - support;
+ } else {
+ ss << ref - support;
+ }
ss << ":";
ss << support;
return ss.str();
}
-std::string Genotyper::mod_breakpoint_vcf(string buffer, std::pair<int, int> ref_strand) {
- int ref=ref_strand.first+ref_strand.second;
- //find last of\t
- //parse #reads supporting
- //print #ref
- string entry;
- int pos = 0;
+//// ============== Fisher exact test for strandness ===========
+void initLogFacs(double* logFacs, int n) {
+ logFacs[0] = 0;
+ for (int i = 1; i < n + 1; ++i) {
+ logFacs[i] = logFacs[i - 1] + log((double) i); // only n times of log() calls
+ }
+}
+
+double logHypergeometricProb(double* logFacs, int a, int b, int c, int d) {
+ return logFacs[a + b] + logFacs[c + d] + logFacs[a + c] + logFacs[b + d] - logFacs[a] - logFacs[b] - logFacs[c] - logFacs[d] - logFacs[a + b + c + d];
+}
+
+double logFac(int n) {
+ double ret;
+ for (ret = 0.; n > 0; --n) {
+ ret += log((double) n);
+ }
+ return ret;
+}
+double logHypergeometricProb(int a, int b, int c, int d) {
+ return logFac(a + b) + logFac(c + d) + logFac(a + c) + logFac(b + d) - logFac(a) - logFac(b) - logFac(c) - logFac(d) - logFac(a + b + c + d);
+}
+
+double fisher_exact(int sv_plus, int sv_minus, int ref_plus, int ref_minus) {
+ int n = sv_plus + sv_minus + ref_plus + ref_minus;
+ double* logFacs = new double[n + 1]; // *** dynamically allocate memory logFacs[0..n] ***
+ initLogFacs(logFacs, n); // *** initialize logFacs array ***
+ double logpCutoff = logHypergeometricProb(logFacs, sv_plus, sv_minus, ref_plus, ref_minus); // *** logFacs added
+ double pFraction = 0;
+ for (int x = 0; x <= n; ++x) {
+ if (sv_plus + sv_minus - x >= 0 && sv_plus + ref_plus - x >= 0 && ref_minus - sv_plus + x >= 0) {
+ double l = logHypergeometricProb(logFacs, x, sv_plus + sv_minus - x, sv_plus + ref_plus - x, ref_minus - sv_plus + x);
+ if (l <= logpCutoff)
+ pFraction += exp(l - logpCutoff);
+ }
+ }
+ double logpValue = logpCutoff + log(pFraction);
+// std::cout << "Two-sided log10-p-value is " << logpValue / log(10.) << std::endl;
+// std::cout << "Two-sided p-value is " << exp(logpValue) << std::endl;
+ delete[] logFacs;
+ return exp(logpValue);
+}
+std::string Genotyper::mod_breakpoint_vcf(string buffer, int ref_plus, int ref_minus) {
+//find last of\t
+//parse #reads supporting
+//print #ref
+
+ string entry;
+ size_t pos = 0;
+ pair<int, int> read_strands;
+ pos = buffer.find("STRANDS2=");
+ if (pos != string::npos) {
+ read_strands.second = 0;
+ while (pos < buffer.size() && buffer[pos] != '\t') {
+ if (buffer[pos - 1] == '=') {
+ read_strands.first = atoi(&buffer[pos]);
+ }
+ if (buffer[pos - 1] == ',') {
+ read_strands.second = atoi(&buffer[pos]);
+ break;
+ }
+ pos++;
+ }
+ }
+ //cout<<"start2 "<<read_strands.first<<" "<< read_strands.second <<endl;
+ double pval = fisher_exact(read_strands.first, read_strands.second,ref_plus,ref_minus);
+ //cout<<"next"<<endl;
pos = buffer.find_last_of("GT");
- //tab
+//tab
entry = buffer.substr(0, pos - 2);
std::stringstream ss;
ss << ";REF_strand=";
- ss << ref_strand.first;
- ss << ",";
- ss << ref_strand.second;
- entry +=ss.str();
-
buffer = buffer.substr(pos + 1); // the right part is only needed:
pos = buffer.find_last_of(':');
int support = atoi(buffer.substr(pos + 1).c_str());
- string msg = assess_genotype(ref, support);
+ int ref_strand = max(ref_plus + ref_minus, support); // TODO not nice but just to make sure.
+ ss << ref_plus << "," << ref_minus;
+ ss << ";Strandbias_pval=" << pval;
+ entry += ss.str();
+
+ if(read_strands.first+read_strands.second>5 && pval<0.01){
+ pos=0;
+ int count=0;
+ for(size_t i=0;i<entry.size();i++){
+ if(entry[i]=='.'){
+ pos=i+2; //for avoiding . and \t
+ }
+ if(entry[i]=='\t' && pos!=0){
+ count++;
+ if(count==2){
+ entry.erase(pos,i-pos);
+ entry.insert(pos,"STRANDBIAS");
+ }
+ }
+ }
+
+ }
+
+ string msg = assess_genotype(ref_strand, support);
if (msg.empty()) {
return "";
}
entry += msg;
+ //cout<<"done"<<endl;
return entry;
}
-std::string Genotyper::mod_breakpoint_bedpe(string buffer, std::pair<int,int> ref_strand) {
+std::string Genotyper::mod_breakpoint_bedpe(string buffer, int ref_plus, int ref_minus) {
- int ref=ref_strand.first+ref_strand.second;
std::string tmp = buffer;
std::string entry = tmp;
entry += '\t';
- //int ref = max(tree.get_ref(node,var.chr,var.pos),tree.get_ref(node,var.chr2,var.pos2));
+//int ref = max(tree.get_ref(node,var.chr,var.pos),tree.get_ref(node,var.chr2,var.pos2));
int pos = tmp.find_last_of('\t'); //TODO!!
int support = atoi(tmp.substr(pos + 1).c_str());
- double allele = (double) support / (double) (support + ref);
+ double allele = (double) support / (double) (ref_plus + ref_minus);
if (allele < Parameter::Instance()->min_allelel_frequency) {
return "";
}
std::stringstream ss;
- ss << ref;
+ ss << ref_plus << "," << ref_minus;
ss << "\t";
ss << support;
entry += ss.str();
@@ -111,7 +575,7 @@ void Genotyper::parse_pos(char * buffer, int & pos, std::string & chr) {
}
variant_str Genotyper::get_breakpoint_vcf(string buffer) {
- //TODO extend for TRA!
+//TODO extend for TRA!
size_t i = 0;
int count = 0;
@@ -193,12 +657,12 @@ void Genotyper::update_file(Breakpoint_Tree & tree, breakpoint_node *& node) {
if (!myfile.good()) {
std::cout << "SVParse: could not open file: " << std::endl;
- exit(0);
+ exit(EXIT_FAILURE);
}
string buffer;
getline(myfile, buffer);
- //parse SVs breakpoints in file
+//parse SVs breakpoints in file
while (!myfile.eof()) { // TODO:if first -> we need to define AF!
if (buffer[0] != '#') {
@@ -212,24 +676,27 @@ void Genotyper::update_file(Breakpoint_Tree & tree, breakpoint_node *& node) {
tmp = get_breakpoint_bedpe(buffer);
}
- std::pair<int,int> first_node=tree.get_ref(node, tmp.chr, tmp.pos);
- std::pair<int,int> second_node=tree.get_ref(node, tmp.chr2, tmp.pos2);
+ std::pair<int, int> first_node = tree.get_ref(node, tmp.chr, tmp.pos);
+ std::pair<int, int> second_node = tree.get_ref(node, tmp.chr2, tmp.pos2);
- std::pair<int,int> final_ref;
- if(first_node.first+first_node.second>second_node.first+second_node.second){
- final_ref=first_node;
- }else{
- final_ref=second_node;
+ std::pair<int, int> final_ref;
+ if (first_node.first + first_node.second > second_node.first + second_node.second) {
+ final_ref = first_node;
+ } else {
+ final_ref = second_node;
}
- if(final_ref.first==-1){
- std::cerr<<"Error in GT: Tree node not found. Exiting."<<std::endl;
- exit(0);
+ if (final_ref.first == -1) {
+ std::cerr << "Error in GT: Tree node not found. Exiting." << std::endl;
+ exit(EXIT_FAILURE);
}
+
+ //int ref = final_ref.first + final_ref.second;
+
if (is_vcf) {
- to_print = mod_breakpoint_vcf(buffer,final_ref);
+ to_print = mod_breakpoint_vcf(buffer, final_ref.first, final_ref.second);
} else {
- to_print = mod_breakpoint_bedpe(buffer,final_ref);
+ to_print = mod_breakpoint_bedpe(buffer, final_ref.first, final_ref.second);
}
if (!to_print.empty()) {
fprintf(file, "%s", to_print.c_str());
@@ -265,15 +732,15 @@ std::vector<std::string> Genotyper::read_SVs(Breakpoint_Tree & tree, breakpoint_
if (!myfile.good()) {
std::cout << "SVParse: could not open file: " << std::endl;
- exit(0);
+ exit(EXIT_FAILURE);
}
- //size_t buffer_size = 250000000;
+//size_t buffer_size = 250000000;
string buffer;
getline(myfile, buffer);
- //char* buffer = new char[buffer_size];
- //myfile.getline(buffer, buffer_size);
- //parse SVs breakpoints in file
+//char* buffer = new char[buffer_size];
+//myfile.getline(buffer, buffer_size);
+//parse SVs breakpoints in file
int num_sv = 0;
int prev_pos1 = 0;
@@ -309,20 +776,54 @@ std::vector<std::string> Genotyper::read_SVs(Breakpoint_Tree & tree, breakpoint_
}
myfile.close();
return ref_dict;
- //tree.inorder(node);
+//tree.inorder(node);
}
void Genotyper::compute_cov(Breakpoint_Tree & tree, breakpoint_node *& node, std::vector<std::string> ref_dict) {
- FILE * ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "r");
- if (ref_allel_reads == NULL) {
- std::cerr << "CovParse: could not open file: " << Parameter::Instance()->tmp_genotyp << std::endl;
- }
- //check if we want to compute the full coverage!
+ /*FILE * ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "r");
+ if (ref_allel_reads == NULL) {
+ std::cerr << "CovParse: could not open file: " << Parameter::Instance()->tmp_genotyp << std::endl;
+ }
+ //check if we want to compute the full coverage!
+
+ size_t nbytes = fread(&tmp, sizeof(struct str_read), 1, ref_allel_reads);*/
+ std::string buffer;
+ std::ifstream myfile;
+
str_read tmp;
- size_t nbytes = fread(&tmp, sizeof(struct str_read), 1, ref_allel_reads);
+//cout<<"File: "<< Parameter::Instance()->tmp_genotyp.c_str()<<endl;
+ myfile.open(Parameter::Instance()->tmp_genotyp.c_str(), std::ifstream::in);
+ if (!myfile.good()) {
+ std::cout << "Genotype Parser: could not open file: " << Parameter::Instance()->tmp_genotyp << std::endl;
+ exit(0);
+ }
int prev_id = -1;
- while (nbytes != 0) {
- // std::cout<<"Read: "<<" " <<tmp.chr_id<<":"<<ref_dict[tmp.chr_id]<<" " <<tmp.start<<" "<<tmp.length<<std::endl;
+ int num_reads = 0;
+
+ getline(myfile, buffer);
+//cout<<buffer<<endl;
+ while (!myfile.eof()) {
+ int count = 0;
+
+ tmp.chr_id = atoi(&buffer[0]);
+ for (size_t i = 0; i < buffer.size() && buffer[i] != '\n' && buffer[i] != '\0'; i++) {
+ if (count == 1 && buffer[i - 1] == '\t') {
+ tmp.start = atoi(&buffer[i]);
+ }
+ if (count == 2 && buffer[i - 1] == '\t') {
+ tmp.length = atoi(&buffer[i]);
+ }
+ if (count == 3 && buffer[i - 1] == '\t') {
+ tmp.strand = atoi(&buffer[i]);
+ break;
+ }
+ if (buffer[i] == '\t') {
+ count++;
+ }
+ }
+
+ //while (nbytes != 0) {
+ // std::cout << "Read: " << " " << tmp.chr_id << ":" << ref_dict[tmp.chr_id] << " " << tmp.start << " " << tmp.length << std::endl;
if (prev_id != tmp.chr_id) {
cout << "\t\tScanning CHR " << ref_dict[tmp.chr_id] << endl;
prev_id = tmp.chr_id;
@@ -332,14 +833,19 @@ void Genotyper::compute_cov(Breakpoint_Tree & tree, breakpoint_node *& node, std
} else {
tree.overalps(tmp.start, tmp.start + tmp.length, ref_dict[tmp.chr_id], node, false);
}
- nbytes = fread(&tmp, sizeof(struct str_read), 1, ref_allel_reads);
+ //nbytes = fread(&tmp, sizeof(struct str_read), 1, ref_allel_reads);
+ num_reads++;
+ getline(myfile, buffer);
}
- fclose(ref_allel_reads);
+
+//cout << "Num: " << num_reads << endl;
+ myfile.close();
+//fclose (ref_allel_reads);
// tree.inorder(node);
}
void Genotyper::update_SVs() {
- //parse SVs not just breakpoints and update with the coverage info
+//parse SVs not just breakpoints and update with the coverage info
cout << "\tConstruct tree" << endl;
std::vector<std::string> ref_dict = read_SVs(this->tree, this->node);
cout << "\tUpdate reference alleles" << endl;
@@ -353,7 +859,6 @@ void Genotyper::update_SVs() {
}
void Genotyper::update_SVs(std::vector<Breakpoint *> & svs, long ref_space) { //refspace for the ref reads!!
-
FILE * ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "r");
if (ref_allel_reads == NULL) {
std::cerr << "Genotype Parser: could not open file: " << Parameter::Instance()->tmp_genotyp << std::endl;
@@ -361,6 +866,9 @@ void Genotyper::update_SVs(std::vector<Breakpoint *> & svs, long ref_space) { //
str_read tmp;
size_t nbytes = fread(&tmp, sizeof(struct str_read), 1, ref_allel_reads);
int num_reads = 0;
+
+//Do the tree per chr to reduce the complexity?
+//
while (nbytes != 0) {
for (size_t i = 0; i < svs.size(); i++) {
if (svs[i]->get_valid()) {
=====================================
src/Genotyper/Genotyper.h
=====================================
@@ -10,12 +10,22 @@
#include "../Paramer.h"
#include "../print/IPrinter.h"
#include "../tree/Breakpoint_Tree.h"
+//#include "../sub/Detect_Breakpoints.h"
struct variant_str{
std::string chr;
std::string chr2;
int pos;
int pos2;
+ int len;
};
+struct str_breakpoint_slim{
+ int pos;
+// int cov;
+ std::map<std::string,bool> rnames;
+ //bool is_start;
+ std::string chr;
+};
+
class Genotyper{
private:
Breakpoint_Tree tree;
@@ -25,10 +35,13 @@ private:
void update_file(Breakpoint_Tree & tree,breakpoint_node *& node);
variant_str get_breakpoint_vcf(string buffer);
variant_str get_breakpoint_bedpe(string buffer);
- std::string mod_breakpoint_vcf(string buffer, std::pair<int,int> ref);
- std::string mod_breakpoint_bedpe(string buffer, std::pair<int,int> ref);
+ std::string mod_breakpoint_vcf(string buffer, int ref_plus, int ref_minus);
+ std::string mod_breakpoint_bedpe(string buffer, int ref_plus, int ref_minus);
void parse_pos(char * buffer, int & pos, std::string & chr);
-
+ void get_breakpoint_vcf(string buffer,str_breakpoint_slim & start,str_breakpoint_slim & stop);
+ void read_SVs(std::map<std::string, std::vector<str_breakpoint_slim> > & entries);
+ void update_svs_output(std::map<std::string, std::vector<str_breakpoint_slim> > entries);
+ variant_str get_breakpoint_bedpe(string buffer, str_breakpoint_slim & start, str_breakpoint_slim & stop);
public:
Genotyper(){
@@ -38,6 +51,7 @@ public:
}
void update_SVs();
+ void update_SVs2();
void update_SVs(std::vector<Breakpoint *> & points,long ref_space);
std::string assess_genotype(int ref, int support);
};
=====================================
src/Paramer.h
=====================================
@@ -34,6 +34,7 @@ private:
static Parameter* m_pInstance;
std::vector<region_str> regions;
public:
+
std::string output_vcf;
std::string output_bedpe;
std::string ref_seq;
@@ -75,7 +76,8 @@ public:
int min_zmw;
// bool splitthreader_output;
- bool debug;
+ bool testing;
+// bool debug;
bool genotype;
bool phase;
bool ignore_std;
@@ -86,6 +88,7 @@ public:
bool cs_string;
bool read_strand;
bool ccs_reads;
+ bool str;
void set_regions(std::string reg) {
size_t i = 0;
=====================================
src/Sniffles.cpp
=====================================
@@ -5,7 +5,6 @@
// Copyright : MIT License
// Description : Detection of SVs for long read data.
//============================================================================
-//For mac: cmake -D CMAKE_C_COMPILER=/opt/local/bin/gcc-mp-4.7 -D CMAKE_CXX_COMPILER=/opt/local/bin/g++-mp-4.7 ..
#include <iostream>
#include "Paramer.h"
#include <tclap/CmdLine.h>
@@ -27,9 +26,9 @@
//cmake -D CMAKE_C_COMPILER=/usr/local/bin/gcc-8 -D CMAKE_CXX_COMPILER=/usr/local/bin/g++-8 ..
-
-
//TODO:
+// Check the calibration again in the beginnig (Iceland study)
+
//check strand headers.
// strand bias??
// I think you could make your performance on PacBio reads even better with a few modifications:
@@ -65,6 +64,8 @@ void read_parameters(int argc, char *argv[]) {
// TCLAP::CmdLine cmd("", ' ', "", true);
TCLAP::CmdLine cmd("Sniffles version ", ' ', Parameter::Instance()->version);
+ TCLAP::ValueArg<std::string> arg_readname("", "test_read", "readname", false, "", "string", cmd);// for testing only!
+
TCLAP::ValueArg<std::string> arg_bamfile("m", "mapped_reads", "Sorted bam File", true, "", "string", cmd);
TCLAP::ValueArg<std::string> arg_vcf("v", "vcf", "VCF output file name", false, "", "string", cmd);
TCLAP::ValueArg<std::string> arg_input_vcf("", "Ivcf", "Input VCF file name. Enable force calling", false, "", "string", cmd);
@@ -85,19 +86,20 @@ void read_parameters(int argc, char *argv[]) {
TCLAP::ValueArg<int> arg_parameter_maxdist("", "max_dist_aln_events", "Maximum distance between alignment (indel) events.", false, 4, "int", cmd);
TCLAP::ValueArg<int> arg_parameter_maxdiff("", "max_diff_per_window", "Maximum differences per 100bp.", false, 50, "int", cmd);
- TCLAP::SwitchArg arg_genotype("", "genotype", "Enables Sniffles to compute the genotypes.", cmd, false);
+ TCLAP::SwitchArg arg_genotype("", "genotype", "Inactivated: Automatically true.", cmd, true);
TCLAP::SwitchArg arg_cluster("", "cluster", "Enables Sniffles to phase SVs that occur on the same reads", cmd, false);
TCLAP::SwitchArg arg_std("", "ignore_sd", "Ignores the sd based filtering. ", cmd, false);
TCLAP::SwitchArg arg_bnd("", "report_BND", "Dont report BND instead use Tra in vcf output. ", cmd, true);
- TCLAP::SwitchArg arg_seq("", "report_seq", "Report sequences for indels in vcf output. (Beta version!) ", cmd, false);
+ TCLAP::SwitchArg arg_seq_old("", "report-seq", "Inactivated (see not_report_seq).", cmd, false);
+ TCLAP::SwitchArg arg_seq("", "not_report_seq", "Dont report sequences for indels in vcf output. (Beta version!) ", cmd, false);
TCLAP::SwitchArg arg_coords("", "change_coords", "Adopt coordinates for force calling if finding evidence. ", cmd, false);
TCLAP::SwitchArg arg_parameter("", "skip_parameter_estimation", "Enables the scan if only very few reads are present. ", cmd, false);
TCLAP::SwitchArg arg_cs_string("", "cs_string", "Enables the scan of CS string instead of Cigar and MD. ", cmd, false);
- TCLAP::SwitchArg arg_read_strand("", "report_read_strands", "Enables the report of the strand categories per read. (Beta) ", cmd, false);
+ //TCLAP::SwitchArg arg_read_strand("", "report_read_strands", "Enables the report of the strand categories per read. (Beta) ", cmd, false);
+ TCLAP::SwitchArg arg_str("", "report_str", "Enables the report of str. (alpha testing) ", cmd, false);
TCLAP::SwitchArg arg_ccs("", "ccs_reads", "Preset CCS Pacbio setting. (Beta) ", cmd, false);
-
TCLAP::ValueArg<float> arg_allelefreq("f", "allelefreq", "Threshold on allele frequency (0-1). ", false, 0.0, "float", cmd);
TCLAP::ValueArg<float> arg_hetfreq("", "min_het_af", "Threshold on allele frequency (0-1). ", false, 0.3, "float", cmd);
TCLAP::ValueArg<float> arg_homofreq("", "min_homo_af", "Threshold on allele frequency (0-1). ", false, 0.8, "float", cmd);
@@ -107,9 +109,9 @@ void read_parameters(int argc, char *argv[]) {
std::stringstream usage;
usage << "" << std::endl;
usage << "Usage: sniffles [options] -m <sorted.bam> -v <output.vcf> " << std::endl;
- usage << "Version: "<<Parameter::Instance()->version << std::endl;
+ usage << "Version: " << Parameter::Instance()->version << std::endl;
usage << "Contact: fritz.sedlazeck at gmail.com" << std::endl;
- usage << std::endl;
+ usage << std::endl;
usage << "Input/Output:" << std::endl;
printParameter<std::string>(usage, arg_bamfile);
@@ -129,7 +131,7 @@ void read_parameters(int argc, char *argv[]) {
printParameter<int>(usage, arg_numreads);
printParameter<int>(usage, arg_segsize);
printParameter<int>(usage, arg_zmw);
- printParameter(usage,arg_cs_string);
+ printParameter(usage, arg_cs_string);
usage << "" << std::endl;
usage << "Clustering/phasing and genotyping:" << std::endl;
@@ -144,9 +146,11 @@ void read_parameters(int argc, char *argv[]) {
usage << "Advanced:" << std::endl;
printParameter(usage, arg_bnd);
printParameter(usage, arg_seq);
+ printParameter(usage,arg_seq_old);
printParameter(usage, arg_std);
- printParameter(usage,arg_read_strand);
- printParameter(usage,arg_ccs);
+ //printParameter(usage, arg_read_strand);
+ printParameter(usage, arg_ccs);
+ printParameter(usage, arg_str);
usage << "" << std::endl;
usage << "Parameter estimation:" << std::endl;
@@ -181,9 +185,9 @@ void read_parameters(int argc, char *argv[]) {
cmd.parse(argc, argv);
Parameter::Instance()->change_coords = arg_coords.getValue();
- Parameter::Instance()->debug = true;
+ //Parameter::Instance()->debug = true;
Parameter::Instance()->score_treshold = 10;
- Parameter::Instance()->read_name = " ";//m54238_180925_225123/56099701/ccs";//m54238_180926_231301/43516780/ccs"; //21_16296949_+";//21_40181680_-";//m151102_123142_42286_c100922632550000001823194205121665_s1_p0/80643/0_20394"; //"22_36746138"; //just for debuging reasons!
+ Parameter::Instance()->read_name =arg_readname.getValue(); //just for debuging reasons!
Parameter::Instance()->bam_files.push_back(arg_bamfile.getValue());
Parameter::Instance()->min_mq = arg_mq.getValue();
Parameter::Instance()->output_vcf = arg_vcf.getValue();
@@ -192,7 +196,7 @@ void read_parameters(int argc, char *argv[]) {
Parameter::Instance()->max_splits = arg_splits.getValue();
Parameter::Instance()->max_dist = arg_dist.getValue();
Parameter::Instance()->min_length = arg_minlength.getValue();
- Parameter::Instance()->genotype = arg_genotype.getValue();
+ Parameter::Instance()->genotype = true;//arg_genotype.getValue();
Parameter::Instance()->phase = arg_cluster.getValue();
Parameter::Instance()->num_threads = arg_threads.getValue();
Parameter::Instance()->output_bedpe = arg_bedpe.getValue();
@@ -202,30 +206,35 @@ void read_parameters(int argc, char *argv[]) {
Parameter::Instance()->min_segment_size = arg_segsize.getValue();
Parameter::Instance()->reportBND = arg_bnd.getValue();
Parameter::Instance()->input_vcf = arg_input_vcf.getValue();
- Parameter::Instance()->print_seq = true;//arg_seq.getValue();
+ Parameter::Instance()->print_seq = !arg_seq.getValue();
Parameter::Instance()->ignore_std = arg_std.getValue();
Parameter::Instance()->min_zmw = arg_zmw.getValue();
Parameter::Instance()->homfreq = arg_homofreq.getValue();
Parameter::Instance()->hetfreq = arg_hetfreq.getValue();
Parameter::Instance()->skip_parameter_estimation = arg_parameter.getValue();
Parameter::Instance()->cs_string = arg_cs_string.getValue();
- Parameter::Instance()->read_strand=arg_read_strand.getValue();
- Parameter::Instance()->ccs_reads=arg_ccs.getValue();
+ Parameter::Instance()->read_strand = true;// arg_read_strand.getValue();
+ Parameter::Instance()->ccs_reads = arg_ccs.getValue();
+ Parameter::Instance()->str = arg_str.getValue();
- if(Parameter::Instance()->ccs_reads){
- Parameter::Instance()->skip_parameter_estimation=true;
- Parameter::Instance()->ignore_std=false;
+ if (Parameter::Instance()->ccs_reads) {
+ Parameter::Instance()->skip_parameter_estimation = true;
+ Parameter::Instance()->ignore_std = false;
}
if (Parameter::Instance()->skip_parameter_estimation) {
- cout<<"\tSkip parameter estimation."<<endl;
+ cout << "\tSkip parameter estimation." << endl;
Parameter::Instance()->score_treshold = 2;
Parameter::Instance()->window_thresh = arg_parameter_maxdiff.getValue();
Parameter::Instance()->max_dist_alns = arg_parameter_maxdist.getValue();
- Parameter::Instance()->avg_del =arg_delratio.getValue();
+ Parameter::Instance()->avg_del = arg_delratio.getValue();
Parameter::Instance()->avg_ins = arg_insratio.getValue();
}
+ if(!Parameter::Instance()->input_vcf.empty()){
+ cout<<"\tForce calling mode enabled. Setting parameter accordingly"<<endl;
+ Parameter::Instance()->min_mq=1;
+ }
//Parse IDS:
/*std::string buffer = arg_chrs.getValue();
@@ -265,184 +274,24 @@ void read_parameters(int argc, char *argv[]) {
//should I check tmp file path??
}
-//some toy/test functions:
-void parse_binary() {
- std::string tmp_name_file = Parameter::Instance()->tmp_file; // this file is created in IPrinter and stores the names and ID of SVS.
- tmp_name_file += "Names";
-
- FILE * alt_allel_reads = fopen(tmp_name_file.c_str(), "r");
- if (alt_allel_reads == NULL) {
- std::cerr << "ClusterParse: could not open tmp file: " << tmp_name_file.c_str() << std::endl;
- }
- std::cout << "start" << std::endl;
- name_str tmp;
- size_t nbytes = fread(&tmp, sizeof(struct name_str), 1, alt_allel_reads);
- std::cout << tmp.read_name << std::endl;
- while (nbytes != 0) {
- int max_ID = std::max(max_ID, tmp.svs_id);
-
- if (tmp.svs_id == 34 || tmp.svs_id == 35) {
- std::cout << "Cluster: " << tmp.svs_id << " " << tmp.read_name << std::endl;
- }
- // std::cout << tmp.read_name << std::endl;
- nbytes = fread(&tmp, sizeof(struct name_str), 1, alt_allel_reads);
- }
- fclose(alt_allel_reads);
-}
-
-double comp_std(std::vector<int> pos, int start) {
- double count = 0;
- double std_start = 0;
-
- for (size_t i = 0; i < pos.size(); i++) {
- count++;
- if (pos[i] != -1) {
- long diff = (start - pos[i]);
- // std::cout << "DIFF Start: " << diff << std::endl;
- std_start += std::pow((double) diff, 2.0);
- }
- }
- return std::sqrt(std_start / count);
-}
-
-void test_sort_insert(int pos, std::vector<int> & positions) {
-
- size_t i = 0;
- while (i < positions.size() && positions[i] < pos) {
- i++;
- }
- positions.insert(positions.begin() + i, pos);
-
-}
-
-double test_comp_std_quantile(std::vector<int> positions, int position) {
- double count = 0;
- std::vector<int> std_start_dists;
- double std_start = 0;
-
- for (std::vector<int>::iterator i = positions.begin(); i != positions.end(); i++) {
-
- long diff = (position - (*i));
- // std::cout << "DIFF Start: " << diff << std::endl;
- test_sort_insert(std::pow((double) diff, 2.0), std_start_dists);
- //std_start += std::pow((double) diff, 2.0);
-
- }
-
- count = 0;
- for (size_t i = 0; i < std_start_dists.size() / 2; i++) {
- std_start += std_start_dists[i];
- count++;
- }
-
- return std::sqrt(std_start / count);
-}
-void test_std() {
- srand(time(NULL));
- int start = rand() % 100000; /// sqrt(1/12) for ins. Plot TRA std vs. cov/support.
- std::vector<int> positions;
- double avg = 0;
- double num = 0;
-
- for (int border = 100; border < 9001; border = border * 10) {
- for (int t = 0; t < 10; t++) {
- for (int cov = 2; cov < 5; cov += 1) {
-
- for (size_t i = 0; i < cov; i++) {
- int pos = (rand() % border) + (start - (border / 2));
- positions.push_back(pos);
- }
- avg += comp_std(positions, start) / test_comp_std_quantile(positions, start);
- std::cout << "Cov: " << cov + 1 << " border: " << border << " STD: " << comp_std(positions, start) << std::endl; // / test_comp_std_quantile(positions, start) << std::endl;
- positions.clear();
- num++;
- }
- }
- }
- std::cout << "AVG: " << avg / num << std::endl;
-}
-
-void get_rand(int mean, int num, vector<int> & positions, int interval) {
-//std::cout << "sim " << num << std::endl;
- for (size_t i = 0; i < num; i++) {
- int pos = (rand() % interval) + (mean - (interval / 2));
- positions.push_back(pos);
- }
-}
-#include <stdlib.h>
-std::vector<int> sort_distance(std::vector<int> positions, int mean) {
- std::vector<int> distances;
- for (size_t i = 0; i < positions.size(); i++) {
- int dist = std::abs(mean - positions[i]);
- size_t j = 0;
- while (j < distances.size()) {
- if (std::abs(mean - distances[j]) < dist) {
- distances.insert(distances.begin() + j, positions[i]);
- break;
- }
- j++;
- }
- if (j == distances.size()) {
- distances.push_back(positions[i]);
- }
- }
- return distances;
-}
-void test_slimming() {
- double fract = 0.2;
- srand(time(NULL));
- int mean = rand() % 100000; /// sqrt(1/12) for ins. Plot TRA std vs. cov/support.
- int intervall = 1000;
-
- std::vector<std::vector<double> > stds;
- int key = 0;
- int cov = 100;
- for (double fract = 0.1; fract < 1; fract += 0.1) {
-
- //std::cout<<fract<<std::endl;
- std::vector<int> positions;
- get_rand(mean, round(cov * fract), positions, intervall); //random process
- get_rand(mean, round(cov * (1 - fract)), positions, 10); //focused calls
- // std::cout << "Cov: " << cov << " border: " << intervall << " STD: " << comp_std(positions, mean) << std::endl;
- std::vector<int> dists;
- dists = sort_distance(positions, mean);
-
- /* for (size_t i = 0; i < dists.size(); i++) {
- std::cout << abs(mean - dists[i]) << std::endl;
- }
- */
- std::vector<double> std_tmp;
- for (size_t i = 0; i < dists.size(); i++) {
- std::vector<int> tmp;
- tmp.assign(dists.rbegin(), dists.rend() - i);
- double std = comp_std(tmp, mean);
- //std::cout << "Points: " << tmp.size() << " STD: " << std << std::endl;
- std_tmp.push_back(std);
- }
- stds.push_back(std_tmp);
- }
- for (size_t i = 0; i < stds.size(); i++) {
- for (size_t j = 0; j < stds[i].size(); j++) {
- std::cout << stds[i][j] << "\t";
- }
- std::cout << std::endl;
- }
-}
int main(int argc, char *argv[]) {
try {
-
+ Parameter::Instance()->testing = true;
//init parameter and reads user defined parameter from command line.
read_parameters(argc, argv);
+
+ // Parameter::Instance()->read_name = "51bda775-ff02-44bb-918b-5193f2c0adce" ;
+
//init openmp:
omp_set_dynamic(0);
omp_set_num_threads(Parameter::Instance()->num_threads);
if ((!Parameter::Instance()->output_vcf.empty()) && (!Parameter::Instance()->output_bedpe.empty())) {
std::cerr << "Please select only vcf OR bedpe output format!" << std::endl;
- exit(1);
+ exit(EXIT_FAILURE);
}
//init printer:
IPrinter * printer;
@@ -477,12 +326,18 @@ int main(int argc, char *argv[]) {
if (Parameter::Instance()->genotype) {
std::cout << "Start genotype calling:" << std::endl;
Genotyper * go = new Genotyper();
- go->update_SVs();
+ // if (!Parameter::Instance()->testing) {
+
+ go->update_SVs2();
+ //} else {
+ // go->update_SVs();
+ // }
}
} catch (TCLAP::ArgException &e) // catch any exceptions
{
std::cerr << "Sniffles error: " << e.error() << " for arg " << e.argId() << std::endl;
+ return -1;
}
return 0;
}
=====================================
src/cluster/Cluster_SVs.cpp
=====================================
@@ -40,7 +40,7 @@ void Cluster_SVS::update_SVs(std::vector<combine_str> & ids) {
myfile.open(filename.c_str(), std::ifstream::in);
if (!myfile.good()) {
std::cout << "Cluster Parse: could not open file: " << std::endl;
- exit(0);
+ exit(EXIT_FAILURE);
}
std::string tmp_name_file = filename;
@@ -104,7 +104,7 @@ void Cluster_SVS::add_id(int curr_id, int new_id, std::vector<combine_str> & ids
std::string Cluster_SVS::find_id(int curr_id, std::vector<combine_str> & ids) {
std::stringstream ss;
for (size_t i = 0; i < ids.size(); i++) {
- if (ids[i].support > Parameter::Instance()->min_grouping_support) {
+ if (ids[i].support >= Parameter::Instance()->min_grouping_support) {
if (ids[i].curr_id == curr_id) {
ss << ids[i].alt_id;
=====================================
src/force_calling/Force_calling.cpp
=====================================
@@ -13,7 +13,7 @@ char assign_type(short type) {
case 0: //DEL
return DEL;
case 1: //DUP
- return INV;
+ return DUP;
case 2: //INV
return INV;
case 3: //TRA
@@ -46,11 +46,11 @@ void fill_tree(IntervallTree & final, TNode *& root_final, RefVector ref, std::m
if (ref_lens.find(entries[i].start.chr) == ref_lens.end()) { // check why this is not called!
cerr << "Error undefined CHR in VCF vs. BAM header: " << entries[i].start.chr << endl;
- exit(0);
+ exit(EXIT_FAILURE);
}
if (ref_lens.find(entries[i].stop.chr) == ref_lens.end()) {
cerr << "Error undefined CHR in VCF vs. BAM header: " << entries[i].stop.chr << endl;
- exit(0);
+ exit(EXIT_FAILURE);
}
svs.start.min_pos = (long) entries[i].start.pos + ref_lens[entries[i].start.chr];
svs.stop.max_pos = (long) entries[i].stop.pos + ref_lens[entries[i].stop.chr];
@@ -80,15 +80,17 @@ void fill_tree(IntervallTree & final, TNode *& root_final, RefVector ref, std::m
invalid_svs++;
}
}
- cerr << "\tInvalid types found skipping " << invalid_svs << " entries." << endl;
+ cerr << "\t\tInvalid types found skipping " << invalid_svs << " entries." << endl;
//std::cout << "Print:" << std::endl;
- //final.print(root_final);
+// final.print(root_final);
entries.clear();
//exit(0);
}
void force_calling(std::string bam_file, IPrinter *& printer) {
- cout << "Force calling SVs" << endl;
+ cout << "Force calling SVs resetting parameters" << endl;
+ Parameter::Instance()->min_mq = 0;
+
//parse reads
//only process reads overlapping SV
estimate_parameters(Parameter::Instance()->bam_files[0]);
@@ -100,9 +102,9 @@ void force_calling(std::string bam_file, IPrinter *& printer) {
ref = mapped_file->get_refInfo();
} else {
cerr << "File Format not recognized. File must be a sorted .bam file!" << endl;
- exit(0);
+ exit(EXIT_FAILURE);
}
- std::cout << "Construct Tree..." << std::endl;
+ std::cout << "\tConstruct Tree..." << std::endl;
//construct the tree:
IntervallTree final;
@@ -116,77 +118,80 @@ void force_calling(std::string bam_file, IPrinter *& printer) {
//FILE * alt_allel_reads;
FILE * ref_allel_reads;
if (Parameter::Instance()->genotype) {
- ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "wb");
+ ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "w");
+// //ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "wb");
}
- Alignment * tmp_aln = mapped_file->parseRead(Parameter::Instance()->min_mq);
+ Alignment * tmp_aln = mapped_file->parseRead(0);
long ref_space = ref_lens[ref[tmp_aln->getRefID()].RefName];
long num_reads = 0;
while (!tmp_aln->getQueryBases().empty()) {
- if ((tmp_aln->getAlignment()->IsPrimaryAlignment()) && (!(tmp_aln->getAlignment()->AlignmentFlag & 0x800) && tmp_aln->get_is_save())) {
+ if ((tmp_aln->getAlignment()->IsPrimaryAlignment())) { //&& (!(tmp_aln->getAlignment()->AlignmentFlag & 0x800) && tmp_aln->get_is_save())) { //TODO disabled for now.
//change CHR:
if (current_RefID != tmp_aln->getRefID()) {
current_RefID = tmp_aln->getRefID();
ref_space = ref_lens[ref[tmp_aln->getRefID()].RefName];
std::cout << "\tSwitch Chr " << ref[tmp_aln->getRefID()].RefName << std::endl; //" " << ref[tmp_aln->getRefID()].RefLength
}
+ if(strcmp(tmp_aln->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0){
+ cout<<"Found read! "<<endl;
+ }
//check if overlap with any breakpoint!!
long read_start_pos = (long) tmp_aln->getPosition() - (long) Parameter::Instance()->max_dist;
read_start_pos += ref_space;
long read_stop_pos = read_start_pos + (long) tmp_aln->getAlignment()->Length + (long) Parameter::Instance()->max_dist; //getRefLength();//(long) tmp_aln->getPosition();
- if (final.overlaps(read_start_pos, read_stop_pos, root_final)) {
+ // cout<<"Check overlap: "<<read_start_pos<<" "<<read_stop_pos<<endl;;
+ //if (final.overlaps(read_start_pos, read_stop_pos, root_final)) {
+ // cout<<" found "<<endl;
//SCAN read:
std::vector<str_event> aln_event;
std::vector<aln_str> split_events;
- if (tmp_aln->getMappingQual() > Parameter::Instance()->min_mq) {
- double score = tmp_aln->get_scrore_ratio();
+ double score = tmp_aln->get_scrore_ratio();
#pragma omp parallel // starts a new team
- {
+ {
#pragma omp sections
+ {
{
- {
- // clock_t begin = clock();
- if ((score == -1 || score > Parameter::Instance()->score_treshold)) {
- aln_event = tmp_aln->get_events_Aln();
- }
- // Parameter::Instance()->meassure_time(begin, " Alignment ");
+ // clock_t begin = clock();
+ if ((score == -1 || score > Parameter::Instance()->score_treshold)) {
+ aln_event = tmp_aln->get_events_Aln();
}
+ // Parameter::Instance()->meassure_time(begin, " Alignment ");
+ }
#pragma omp section
- {
- // clock_t begin_split = clock();
- split_events = tmp_aln->getSA(ref);
- // Parameter::Instance()->meassure_time(begin_split," Split reads ");
- }
+ {
+ // clock_t begin_split = clock();
+ split_events = tmp_aln->getSA(ref);
+ // Parameter::Instance()->meassure_time(begin_split," Split reads ");
}
}
- //tmp_aln->set_supports_SV(aln_event.empty() && split_events.empty());
-
- //Store reference supporting reads for genotype estimation:
- str_read tmp;
- bool SV_support = !(aln_event.empty() && split_events.empty());
- if ((Parameter::Instance()->genotype && !SV_support) && (score == -1 || score > Parameter::Instance()->score_treshold)) {
- //write read:
- //cout<<"REf: "<<tmp_aln->getName()<<" "<<tmp_aln->getPosition()<<" "<<tmp_aln->getRefLength()<<endl;
- tmp.chr_id = tmp_aln->getRefID();
- tmp.start = tmp_aln->getPosition();
- tmp.length = tmp_aln->getRefLength();
- fwrite(&tmp, sizeof(struct str_read), 1, ref_allel_reads);
- }
+ }
+ //tmp_aln->set_supports_SV(aln_event.empty() && split_events.empty());
- //store the potential SVs:
- if (!aln_event.empty()) {
- add_events(tmp_aln, aln_event, 0, ref_space, final, root_final, num_reads, true);
- }
- if (!split_events.empty()) {
- add_splits(tmp_aln, split_events, 1, ref, final, root_final, num_reads, true);
- }
+ //Store reference supporting reads for genotype estimation:
+ // str_read tmp;
+ if ((Parameter::Instance()->genotype && (aln_event.empty() && split_events.empty()))) { //}&& (score == -1 || score > Parameter::Instance()->score_treshold)))) {
+ //write read:
+ write_read(tmp_aln, ref_allel_reads);
}
+
+ //store the potential SVs:
+ if (!aln_event.empty()) {
+ // cout<<"\t adding aln: "<<endl;
+ add_events(tmp_aln, aln_event, 0, ref_space, final, root_final, num_reads, true);
+ }
+ if (!split_events.empty()) {
+ // cout<<"\t adding split: "<<endl;
+ add_splits(tmp_aln, split_events, 1, ref, final, root_final, num_reads, true);
+ }
+
}
- }
+ // cout<<" none "<<endl;
+ // }
//get next read:
- mapped_file->parseReadFast(Parameter::Instance()->min_mq, tmp_aln);
+ mapped_file->parseReadFast(0, tmp_aln);
num_reads++;
=====================================
src/force_calling/VCF_parser.cpp
=====================================
@@ -283,13 +283,16 @@ std::vector<strvcfentry> parse_vcf(std::string filename, int min_svs) {
myfile.open(filename.c_str(), std::ifstream::in);
if (!myfile.good()) {
std::cout << "VCF Parser: could not open file: " << filename.c_str() << std::endl;
- exit(0);
+ exit(EXIT_FAILURE);
}
std::vector<strvcfentry> calls;
//myfile.getline(buffer, buffer_size);
getline(myfile, buffer);
int num = 0;
+
+ int num_dup=0;
+
while (!myfile.eof()) {
if (buffer[0] != '#') {
// std::cout<<num<<"\t"<<buffer<<std::endl;
@@ -379,6 +382,9 @@ std::vector<strvcfentry> parse_vcf(std::string filename, int min_svs) {
if (count == 4 && (tmp.type == -1 && buffer[i - 1] == '<')) {
tmp.type = get_type(std::string(&buffer[i]));
+ if(tmp.type==1){
+ num_dup++;
+ }
}
if (tmp.stop.pos == -1 && (count == 4 && (buffer[i - 1] == '[' || buffer[i - 1] == ']'))) {
tmp.stop = parse_pos(&buffer[i - 1]);
@@ -448,6 +454,6 @@ std::vector<strvcfentry> parse_vcf(std::string filename, int min_svs) {
getline(myfile, buffer);
}
myfile.close();
-//std::cout << calls.size() << std::endl;
+ //std::cout << calls.size()<<" DUPS: " <<num_dup << std::endl;
return calls;
}
=====================================
src/print/BedpePrinter.cpp
=====================================
@@ -28,36 +28,40 @@ void BedpePrinter::print_body(Breakpoint * &SV, RefVector ref) {
std::string chr;
std::string strands = SV->get_strand(2);
- int pos = IPrinter::calc_pos(SV->get_coordinates().start.min_pos, ref, chr);
-
+ int start_pos = IPrinter::calc_pos(SV->get_coordinates().start.most_support, ref, chr);
//start coordinates:
fprintf(file, "%s", chr.c_str());
fprintf(file, "%c", '\t');
- fprintf(file, "%i", pos);
+ fprintf(file, "%i", start_pos);
fprintf(file, "%c", '\t');
- fprintf(file, "%i", IPrinter::calc_pos(SV->get_coordinates().start.max_pos, ref, chr));
+ fprintf(file, "%i", start_pos+1);
fprintf(file, "%c", '\t');
//stop coordinates
- string chr_start;
- int start = IPrinter::calc_pos(SV->get_coordinates().start.most_support, ref, chr_start);
+ string chr_stop;
+ int stop_pos = IPrinter::calc_pos(SV->get_coordinates().stop.most_support, ref, chr_stop);
- long end_coord = SV->get_coordinates().stop.min_pos;
- if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {
- end_coord = std::max((SV->get_coordinates().stop.min_pos -(long)SV->get_length()), (long)start);
+ if(SV->get_SVtype()& INS){
+ stop_pos=std::max(stop_pos-(int)SV->get_length(),start_pos);
+ }
+
+ /* long end_coord = SV->get_coordinates().stop.most_support;
+ if (((SV->get_SVtype() & INS))){// && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {
+ end_coord = std::max((SV->get_coordinates().stop.most_support -(long)SV->get_length()), (long)start);
}
pos = IPrinter::calc_pos(end_coord, ref, chr);
- fprintf(file, "%s", chr.c_str());
+*/
+ fprintf(file, "%s", chr_stop.c_str());
fprintf(file, "%c", '\t');
- fprintf(file, "%i", pos);
+ fprintf(file, "%i", stop_pos);
fprintf(file, "%c", '\t');
- end_coord = SV->get_coordinates().stop.max_pos;
+ /*end_coord = SV->get_coordinates().stop.most_support;
if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {
- end_coord = std::max((SV->get_coordinates().stop.max_pos - (long) SV->get_length()), (long)start);
- }
- fprintf(file, "%i", IPrinter::calc_pos(end_coord, ref, chr));
+ end_coord = std::max((SV->get_coordinates().stop.most_support - (long) SV->get_length()), (long)start);
+ }*/
+ fprintf(file, "%i", stop_pos+1);
fprintf(file, "%c", '\t');
fprintf(file, "%i", id);
id++;
@@ -72,25 +76,25 @@ void BedpePrinter::print_body(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%c", '\t');
fprintf(file, "%i", SV->get_support());
fprintf(file, "%c", '\t');
- fprintf(file, "%s", chr_start.c_str());
+ fprintf(file, "%s", chr.c_str());
fprintf(file, "%c", '\t');
- fprintf(file, "%i", start);
+ fprintf(file, "%i", start_pos);
fprintf(file, "%c", '\t');
-
+/*
end_coord = SV->get_coordinates().stop.most_support;
- if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {
+ if (((SV->get_SVtype() & INS) )){//&& SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {
end_coord = std::max((SV->get_coordinates().stop.most_support - (long) SV->get_length()), (long)start);
}
- pos = IPrinter::calc_pos(end_coord, ref, chr);
- fprintf(file, "%s", chr.c_str());
+ pos = IPrinter::calc_pos(end_coord, ref, chr);*/
+ fprintf(file, "%s", chr_stop.c_str());
fprintf(file, "%c", '\t');
- fprintf(file, "%i", pos);
+ fprintf(file, "%i", stop_pos);
fprintf(file, "%c", '\t');
if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {//!
- fprintf(file, "%s", "NA");
+ fprintf(file, "%i", 999999999);
} else {
fprintf(file, "%i", SV->get_length());
}
=====================================
src/print/IPrinter.cpp
=====================================
@@ -7,6 +7,29 @@
#include "IPrinter.h"
+void write_read(Alignment * tmp_aln, FILE * & ref_allel_reads) {
+ /* tmp.chr_id = tmp_aln->getRefID(); //check string in binary???
+ tmp.start = tmp_aln->getPosition();
+ tmp.length = tmp_aln->getRefLength();
+ if (tmp_aln->getStrand()) {
+ tmp.strand = 1;
+ } else {
+ tmp.strand = 2;
+ }*/
+
+ fprintf(ref_allel_reads, "%i", tmp_aln->getRefID());
+ fprintf(ref_allel_reads, "%ref_plus", '\t');
+ fprintf(ref_allel_reads, "%i", tmp_aln->getPosition());
+ fprintf(ref_allel_reads, "%ref_plus", '\t');
+ fprintf(ref_allel_reads, "%i", tmp_aln->getRefLength());
+ fprintf(ref_allel_reads, "%ref_plus", '\t');
+ if (tmp_aln->getStrand()) {
+ fprintf(ref_allel_reads, "%ref_plus", '1');
+ } else {
+ fprintf(ref_allel_reads, "%ref_plus", '2');
+ }
+ fprintf(ref_allel_reads, "%ref_plus", '\n');
+}
std::string IPrinter::assess_genotype(int ref, int support) {
double allele = (double) support / (double) (support + ref);
@@ -20,10 +43,10 @@ std::string IPrinter::assess_genotype(int ref, int support) {
ss << allele;
ss << "\tGT:DR:DV\t";
if (allele > Parameter::Instance()->homfreq) {
- ss <<"1/1:";
+ ss << "1/1:";
} else if (allele > Parameter::Instance()->hetfreq) {
ss << "0/1:";
- }else{
+ } else {
ss << "0/0:";
}
ss << ref;
@@ -32,8 +55,6 @@ std::string IPrinter::assess_genotype(int ref, int support) {
return ss.str();
}
-
-
bool IPrinter::is_huge_ins(Breakpoint * &SV) {
int counts = 0;
std::map<std::string, read_str> support = SV->get_coordinates().support;
@@ -151,9 +172,9 @@ const std::string IPrinter::currentDateTime() {
struct tm tstruct;
char buf[80];
tstruct = *localtime(&now);
- // Visit http://en.cppreference.com/w/cpp/chrono/c/strftime
+ // Visit http://en.cppreference.com/w/cpp/chrono/ref_plus/strftime
// for more information about date/time format
- strftime(buf, sizeof(buf), "%Y%m%d", &tstruct);
+ strftime(buf, sizeof(buf), "%Y%m%ref_minus", &tstruct);
return buf;
}
@@ -264,7 +285,6 @@ pair<double, double> IPrinter::comp_std_quantile(Breakpoint * &SV, pair<double,
std.first = std::sqrt(std.first / count);
std.second = std::sqrt(std.second / count);
-
s4_start = s4_start / count;
s4_stop = s4_stop / count;
s2_start = s2_start / count;
=====================================
src/print/IPrinter.h
=====================================
@@ -17,9 +17,11 @@
#include "../cluster/Cluster_SVs.h"
#include "../Genotyper/Genotyper.h"
#include <math.h>
+#include <cmath>
+#include <cstdlib>
double const uniform_variance = 0.2886751; //sqrt(1/12) see variance of uniform distribution -> std
-
+void write_read(Alignment * tmp_aln, FILE * & ref_allel_reads);
class IPrinter {
protected:
FILE *file;
@@ -57,7 +59,9 @@ public:
if(Parameter::Instance()->input_vcf.empty()){
print_body(SV, ref);
}else{
- print_body_recall(SV,ref);
+ //test
+ print_body(SV, ref);
+ //print_body_recall(SV,ref);
}
}
void init() {
@@ -98,6 +102,7 @@ public:
void comp_std_med(Breakpoint * &SV, double & std_start, double & std_stop);
pair<double, double> comp_std_quantile(Breakpoint * &SV, pair<double, double>& std, double & std_lenght, int & zmw_num);
const std::string currentDateTime();
+ double fisher_exact(int sv_plus, int sv_minus, int ref_plus, int ref_minus);
};
#endif /* PRINT_IPRINTER_H_ */
=====================================
src/print/VCFPrinter.cpp
=====================================
@@ -31,15 +31,16 @@ void VCFPrinter::print_header() {
fprintf(file, "%s", "##ALT=<ID=INVDUP,Description=\"InvertedDUP with unknown boundaries\">\n");
fprintf(file, "%s", "##ALT=<ID=TRA,Description=\"Translocation\">\n");
fprintf(file, "%s", "##ALT=<ID=INS,Description=\"Insertion\">\n");
+ fprintf(file, "%s", "##FILTER=<ID=UNRESOLVED,Description=\"An insertion that is longer than the read and thus we cannot predict the full size.\">\n");
fprintf(file, "%s", "##INFO=<ID=CHR2,Number=1,Type=String,Description=\"Chromosome for END coordinate in case of a translocation\">\n");
fprintf(file, "%s", "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the structural variant\">\n");
fprintf(file, "%s", "##INFO=<ID=MAPQ,Number=1,Type=Integer,Description=\"Median mapping quality of paired-ends\">\n");
fprintf(file, "%s", "##INFO=<ID=RE,Number=1,Type=Integer,Description=\"read support\">\n");
fprintf(file, "%s", "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variation\">\n");
fprintf(file, "%s", "##INFO=<ID=PRECISE,Number=0,Type=Flag,Description=\"Precise structural variation\">\n");
- fprintf(file, "%s", "##INFO=<ID=UNRESOLVED,Number=0,Type=Flag,Description=\"An insertion that is longer than the read and thus we cannot predict the full size.\">\n");
+ //##FILTER=<ID=LowQual,Description="PE/SR support below 3 or mapping quality below 20.">
fprintf(file, "%s", "##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Length of the SV\">\n");
- fprintf(file, "%s", "##INFO=<ID=REF_strand,Number=2,Type=Integer,Description=\"Length of the SV\">\n");
+ //fprintf(file, "%s", "##INFO=<ID=REF_strand,Number=2,Type=Integer,Description=\"Number of reads supporting per strand\">\n");
fprintf(file, "%s", "##INFO=<ID=SVMETHOD,Number=1,Type=String,Description=\"Type of approach used to detect SV\">\n");
fprintf(file, "%s", "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\n");
if (Parameter::Instance()->report_n_reads > 0 || Parameter::Instance()->report_n_reads == -1) {
@@ -49,16 +50,17 @@ void VCFPrinter::print_header() {
fprintf(file, "%s", "##INFO=<ID=SEQ,Number=1,Type=String,Description=\"Extracted sequence from the best representative read.\">\n");
}
- if (Parameter::Instance()->read_strand) {
+// if (Parameter::Instance()->read_strand) {
fprintf(file, "%s", "##INFO=<ID=STRANDS2,Number=4,Type=Integer,Description=\"alt reads first + ,alt reads first -,alt reads second + ,alt reads second -.\">\n");
- fprintf(file, "%s", "##INFO=<ID=REF_strand,Number=2,Type=Integer,Description=\"plus strand ref, minus strand ref.\">\n");
- }
+ fprintf(file, "%s", "##INFO=<ID=REF_strand,Number=.,Type=Integer,Description=\"plus strand ref, minus strand ref.\">\n");
+ fprintf(file, "%s", "##INFO=<ID=Strandbias_pval,Number=A,Type=Float,Description=\"P-value for fisher exact test for strand bias.\">\n");
+// }
fprintf(file, "%s", "##INFO=<ID=STD_quant_start,Number=A,Type=Float,Description=\"STD of the start breakpoints across the reads.\">\n");
fprintf(file, "%s", "##INFO=<ID=STD_quant_stop,Number=A,Type=Float,Description=\"STD of the stop breakpoints across the reads.\">\n");
fprintf(file, "%s", "##INFO=<ID=Kurtosis_quant_start,Number=A,Type=Float,Description=\"Kurtosis value of the start breakpoints across the reads.\">\n");
fprintf(file, "%s", "##INFO=<ID=Kurtosis_quant_stop,Number=A,Type=Float,Description=\"Kurtosis value of the stop breakpoints across the reads.\">\n");
- fprintf(file, "%s", "##INFO=<ID=SUPTYPE,Number=A,Type=String,Description=\"Type by which the variant is supported.(SR,ALN,NR)\">\n");
+ fprintf(file, "%s", "##INFO=<ID=SUPTYPE,Number=.,Type=String,Description=\"Type by which the variant is supported.(SR,AL,NR)\">\n");
fprintf(file, "%s", "##INFO=<ID=STRANDS,Number=A,Type=String,Description=\"Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)\">\n");
fprintf(file, "%s", "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency.\">\n");
fprintf(file, "%s", "##INFO=<ID=ZMW,Number=A,Type=Integer,Description=\"Number of ZMWs (Pacbio) supporting SV.\">\n");
@@ -74,7 +76,153 @@ void VCFPrinter::print_header() {
fprintf(file, "%c", '\n');
}
+
+map<std::string, vector<int> > init_motives2() {
+ map<std::string, vector<int> > motives;\
+ motives["ATCT"].push_back(0); // = 0; //rev comp
+ motives["GTCT"].push_back(0);
+ motives["TCTA"].push_back(0); // = 0;
+ motives["GGAA"].push_back(0); // = 0;
+ motives["GGCA"].push_back(0); // = 0;
+ motives["AAGG"].push_back(0); // = 0;
+ motives["AGAA"].push_back(0); // = 0;
+ motives["CTAT"].push_back(0); // = 0;
+ motives["TGAA"].push_back(0); // = 0;
+ motives["GATA"].push_back(0); // = 0;
+ motives["GACA"].push_back(0); // = 0;
+ motives["TAGA"].push_back(0); // = 0;
+ motives["CAGA"].push_back(0); // = 0;
+ motives["TATC"].push_back(0); // = 0;
+ motives["TTTTC"].push_back(0); // = 0;
+ motives["GATA"].push_back(0); // = 0;
+ motives["TCCT"].push_back(0); // = 0;
+ motives["TATC"].push_back(0); // = 0;
+ motives["AAAGA"].push_back(0); // = 0;
+ motives["ATT"].push_back(0); // = 0;
+
+ motives["TAGA"].push_back(0); // = 0;
+ motives["GAAA"].push_back(0); // = 0;
+ motives["TCTG"].push_back(0); // = 0;
+ motives["TAT"].push_back(0); // = 0;
+ motives["AGAT"].push_back(0); // = 0;
+ motives["TGGA"].push_back(0); // = 0;
+ motives["TTTTC"].push_back(0); // = 0;
+ motives["GATA"].push_back(0); // = 0;
+ motives["AGAGAT"].push_back(0); // = 0;
+ motives["GAAA"].push_back(0); // = 0;
+ motives["CTT"].push_back(0); // = 0;
+ motives["ATCT"].push_back(0); // = 0;
+ motives["GATA"].push_back(0); // = 0;
+ motives["TTTC"].push_back(0); // = 0;
+ motives["AAAG"].push_back(0); // = 0;
+ motives["CTTTT"].push_back(0); // = 0;
+
+ return motives;
+}
+
+map<std::string, int> init_motives() {
+ map<std::string, int> motives;
+ motives["TGAA"] = 0;
+ motives["ATCT"] = 0; //rev comp
+ motives["GTCT"] = 0;
+ motives["GGAA"] = 0;
+ motives["GGCA"] = 0;
+ motives["AAGG"] = 0;
+ motives["AGAA"] = 0;
+ motives["AGAT"] = 0;
+ motives["CTAT"] = 0;
+ motives["TCTA"] = 0;
+ motives["GATA"] = 0;
+ motives["GACA"] = 0;
+ motives["TAGA"] = 0;
+ motives["CAGA"] = 0;
+ motives["TATC"] = 0;
+ motives["TTTTC"] = 0;
+ motives["GATA"] = 0;
+ motives["TCCT"] = 0;
+ motives["TATC"] = 0;
+ motives["AAAGA"] = 0;
+ motives["ATT"] = 0;
+
+ motives["TAGA"]= 0;
+ motives["GAAA"]= 0;
+ motives["TCTG"]= 0;
+ motives["TAT"]= 0;
+ motives["TGGA"]= 0;
+ motives["AGAGAT"]= 0;
+ motives["GAAA"]= 0;
+ motives["CTT"]= 0;
+ motives["ATCT"]= 0;
+ motives["GATA"]= 0;
+ motives["TTTC"]= 0;
+ motives["AAAG"]= 0;
+ motives["CTTTT"]= 0;
+ return motives;
+}
+
+void VCFPrinter::report_STR(Breakpoint * &SV, RefVector ref) {
+
+ //STR code!:::::
+ //=============
+
+ if (Parameter::Instance()->str && ((SV->get_SVtype() & INS) || (SV->get_SVtype() & DEL))) {
+ map<std::string, std::vector<int> > motives2 = init_motives2();
+ std::string chr;
+ int start = IPrinter::calc_pos(SV->get_coordinates().start.most_support, ref, chr);
+ cout << "NEW REGION: " << chr << ":" << start << " " << IPrinter::get_type(SV->get_SVtype()) << endl;
+ std::map<std::string, read_str> support = SV->get_coordinates().support;
+ for (std::map<std::string, read_str>::iterator i = support.begin(); i != support.end(); i++) {
+ if (abs(SV->get_coordinates().start.most_support - (*i).second.coordinates.first) < 2) { //10
+ map<std::string, int> motives = init_motives();
+ int counts = 0;
+ std::string sequence = (*i).second.sequence;
+ //cout<<"check seq"<<endl;
+ for (size_t p = 0; p < sequence.size(); p++) {
+ if (motives.find(sequence.substr(p, 3)) != motives.end()) {
+ motives[sequence.substr(p, 3)]++;
+ }
+ if (motives.find(sequence.substr(p, 4)) != motives.end()) {
+ motives[sequence.substr(p, 4)]++;
+ }
+ if (motives.find(sequence.substr(p, 5)) != motives.end()) {
+ motives[sequence.substr(p, 5)]++;
+ }
+ }
+ //cout<<"Summarize"<<endl;
+ for (map<std::string, int>::iterator p = motives.begin(); p != motives.end(); p++) {
+ if ((*p).second > 0) {
+ while ((*p).second + 1 > motives2[(*p).first].size()) {
+ motives2[(*p).first].push_back(0);
+ }
+ motives2[(*p).first][(*p).second]++;
+ }
+ }
+ }
+ }
+ //cout<<"Print"<<endl;
+ std::stringstream ss2;
+ int num_entries = 0;
+ for (map<std::string, vector<int> >::iterator p = motives2.begin(); p != motives2.end(); p++) {
+ //if ((*p).second.size() ) {
+ int max = 0;
+ std::stringstream ss;
+ ss << (*p).first << ":";
+ for (size_t i = 0; i < (*p).second.size(); i++) {
+ ss << (*p).second[i] << ";";
+ if ((*p).second[i] > max) {
+ max = (*p).second[i];
+ }
+ }
+ if (max > 1) {
+ cout << ss.str() << endl;
+ }
+ }
+ }
+
+}
+
void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
+
if (!this->bed_tree.is_in(SV->get_coordinates().start.most_support, this->root) && !this->bed_tree.is_in(SV->get_coordinates().stop.most_support, this->root)) {
//temp. store read names supporting this SVs to later group the SVs together.
double std_quant_start = 0;
@@ -85,6 +233,13 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
double std_length = 0;
int zmws = 0;
bool ok_to_print = (to_print(SV, std_quant, kurtosis, std_length, zmws) || Parameter::Instance()->ignore_std);
+ if (Parameter::Instance()->str) {
+ report_STR(SV, ref);
+ }
+ if(!Parameter::Instance()->input_vcf.empty()){//TEST
+ ok_to_print=true;
+ Parameter::Instance()->min_zmw=0;
+ }
//std::cout << "Print check: " << std_quant.first << " " << std_quant.second << endl;
if (ok_to_print && (zmws == 0 || zmws >= Parameter::Instance()->min_zmw)) {
if (Parameter::Instance()->phase) {
@@ -94,17 +249,23 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
int start = IPrinter::calc_pos(SV->get_coordinates().start.most_support, ref, chr);
fprintf(file, "%s", chr.c_str());
fprintf(file, "%c", '\t');
- fprintf(file, "%i", start);
+ if (start < 1) {
+ start = 1;
+ }
+ fprintf(file, "%i", start+1);
fprintf(file, "%c", '\t');
fprintf(file, "%i", id);
id++;
long end_coord = SV->get_coordinates().stop.most_support;
- if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {
+ if (((SV->get_SVtype() & INS))) { // && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {
end_coord = std::max((SV->get_coordinates().stop.most_support - (long) SV->get_length()), (long) start);
}
- int end = IPrinter::calc_pos(end_coord, ref, chr);
+ int end = IPrinter::calc_pos(end_coord, ref, chr)+1;
+ if (end < 1) {
+ end = 1;
+ }
std::string strands = SV->get_strand(1);
if (Parameter::Instance()->reportBND && (SV->get_SVtype() & TRA)) {
@@ -146,6 +307,7 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%c", '>');
}
+ // Check p-value and set tag for strand bias if RE support > 7 !
if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {
fprintf(file, "%s", "\t.\tUNRESOLVED\t");
} else {
@@ -161,7 +323,7 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%s", ";SVMETHOD=Snifflesv");
fprintf(file, "%s", Parameter::Instance()->version.c_str());
- if (!(Parameter::Instance()->reportBND && (SV->get_SVtype() & TRA))) {
+ // if (!(Parameter::Instance()->reportBND && (SV->get_SVtype() & TRA))) {
fprintf(file, "%s", ";CHR2=");
fprintf(file, "%s", chr.c_str());
@@ -173,7 +335,7 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%i", end);
}
- }
+ //}
if (zmws != 0) {
fprintf(file, "%s", ";ZMW=");
fprintf(file, "%i", zmws);
@@ -202,10 +364,14 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%s", SV->get_supporting_types().c_str());
fprintf(file, "%s", ";SVLEN=");
- if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) { //!
- fprintf(file, "%i", 999999999);
+ if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {
+ if (SV->get_sequence().size() != 0) { //!
+ fprintf(file, "%i", SV->get_sequence().size());
+ } else {
+ fprintf(file, "%i", 1);
+ }
} else if (SV->get_SVtype() & TRA) {
- fprintf(file, "%i", 0);
+ fprintf(file, "%i", 1);
} else if (SV->get_SVtype() & DEL) {
fprintf(file, "%i", SV->get_length() * -1);
} else {
@@ -214,7 +380,7 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
// }
fprintf(file, "%s", ";STRANDS=");
fprintf(file, "%s", strands.c_str());
- if (Parameter::Instance()->read_strand) {
+ // if (Parameter::Instance()->read_strand) {
fprintf(file, "%s", ";STRANDS2=");
std::map<std::string, read_str> support = SV->get_coordinates().support;
pair<int, int> tmp_start;
@@ -242,7 +408,7 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%i", tmp_stop.first);
fprintf(file, "%s", ",");
fprintf(file, "%i", tmp_stop.second);
- }
+ // }
// if (Parameter::Instance()->print_seq && !SV->get_sequence().empty()) {
// fprintf(file, "%s", ";SEQ=");
@@ -312,7 +478,7 @@ void VCFPrinter::print_body_recall(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%s", "IMPRECISE");
fprintf(file, "%s", ";SVMETHOD=Snifflesv");
fprintf(file, "%s", Parameter::Instance()->version.c_str());
- if (!(Parameter::Instance()->reportBND && (SV->get_SVtype() & TRA))) {
+// if (!(Parameter::Instance()->reportBND && (SV->get_SVtype() & TRA))) {
fprintf(file, "%s", ";CHR2=");
fprintf(file, "%s", chr.c_str());
fprintf(file, "%s", ";END=");
@@ -322,7 +488,7 @@ void VCFPrinter::print_body_recall(Breakpoint * &SV, RefVector ref) {
} else {
fprintf(file, "%i", end);
}
- }
+// }
fprintf(file, "%s", ";SVTYPE=");
if (Parameter::Instance()->reportBND && (SV->get_SVtype() & TRA)) {
=====================================
src/print/VCFPrinter.h
=====================================
@@ -15,6 +15,7 @@ private:
void print_header();
void print_body(Breakpoint * &SV, RefVector ref);
void print_body_recall(Breakpoint * &SV, RefVector ref);
+ void report_STR(Breakpoint * &SV, RefVector ref);
public:
VCFPrinter(){
=====================================
src/sub/Breakpoint.cpp
=====================================
@@ -103,7 +103,7 @@ int get_dist(Breakpoint * tmp) {
long Breakpoint::overlap(Breakpoint * tmp) {
bool flag = false;
-//flag = ((*tmp->get_coordinates().support.begin()).second.SV & DEL);
+ //flag = ((*tmp->get_coordinates().support.begin()).second.SV & INS);
int max_dist = std::min(get_dist(tmp), get_dist(this)); // Parameter::Instance()->max_dist
if (flag) {
std::cout << "\t Overlap: " << max_dist << " start: " << tmp->get_coordinates().start.min_pos << " " << positions.start.min_pos << " stop :" << tmp->get_coordinates().stop.max_pos << " " << positions.stop.max_pos;
@@ -133,7 +133,7 @@ long Breakpoint::overlap(Breakpoint * tmp) {
//Standard merging.
if (is_same_strand(tmp) && (abs(tmp->get_coordinates().start.min_pos - positions.start.min_pos) < max_dist && abs(tmp->get_coordinates().stop.max_pos - positions.stop.max_pos) < max_dist)) {
if (flag) {
- cout << "\tHIT" << endl;
+ cout << "\tMERGED" << endl;
}
return 0;
}
@@ -292,9 +292,11 @@ void Breakpoint::calc_support() {
if (get_SVtype() & TRA) { // we cannot make assumptions abut support yet.
set_valid((bool) (get_support() >= 1)); // this is needed as we take each chr independently and just look at the primary alignment
- } else if (get_support() >= Parameter::Instance()->min_support) {
+ } else if (get_support() >= min(3, Parameter::Instance()->min_support)) { // to allow the merge of split SV
predict_SV();
- set_valid((bool) (get_length() > Parameter::Instance()->min_length));
+ if (get_support() >= Parameter::Instance()->min_support) {
+ set_valid((bool) (get_length() > Parameter::Instance()->min_length));
+ }
}
}
@@ -425,13 +427,19 @@ void Breakpoint::predict_SV() {
long counts = 0;
int maxim = 0;
long coord = 0;
+ long min_start = 0;
+ long max_stop = 0;
if (num > 0) {
+ min_start = (*starts.begin()).first;
//find the start coordinate:
for (map<long, int>::iterator i = starts.begin(); i != starts.end(); i++) {
if ((*i).second > maxim) {
coord = (*i).first;
maxim = (*i).second;
}
+ if ((*i).first < min_start) {
+ min_start = (*i).first;
+ }
}
if (maxim < 5) {
this->positions.start.most_support = get_median(start2); //check if "input"!
@@ -450,6 +458,9 @@ void Breakpoint::predict_SV() {
coord = (*i).first;
maxim = (*i).second;
}
+ if ((*i).first > max_stop) {
+ max_stop = (*i).first;
+ }
}
if (maxim < 5) {
this->positions.stop.most_support = get_median(stops2); // mean / counts;
@@ -483,6 +494,7 @@ void Breakpoint::predict_SV() {
}
starts.clear();
stops.clear();
+
//strand information:
for (size_t i = 0; i < strands.size(); i++) {
maxim = 0;
@@ -503,17 +515,17 @@ void Breakpoint::predict_SV() {
}
if (!scan_reads) {
- // std::cout<<"Test strand"<<std::endl;
+ // std::cout<<"Test strand"<<std::endl;
//if forcecalling we need to define the strands:
std::map<std::string, read_str>::iterator it = positions.support.find("input");
//no output!??
/*if ((*it).second.strand.first) {
- std::cout << "s1: true" << endl;
- }
- if ((*it).second.strand.second) {
- std::cout << "s2: true" << endl;
- }*/
+ std::cout << "s1: true" << endl;
+ }
+ if ((*it).second.strand.second) {
+ std::cout << "s2: true" << endl;
+ }*/
if (it != positions.support.end()) {
this->strand.push_back(translate_strand((*it).second.strand));
}
@@ -703,3 +715,16 @@ std::string Breakpoint::to_string(RefVector ref) {
return ss.str();
}
+void Breakpoint::integrate(Breakpoint * p) {
+ //carry over support:
+ position_str old = p->get_coordinates();
+ for (std::map<std::string, read_str>::iterator i = old.support.begin(); i != old.support.end(); i++) {
+ this->positions.support[(*i).first] = (*i).second;
+ // cout << "Added " << (*i).first << endl;
+ }
+
+// cout << "Recompute: " << endl;
+ this->calc_support();
+ set_valid((bool) (get_length() > Parameter::Instance()->min_length));
+}
+
=====================================
src/sub/Breakpoint.h
=====================================
@@ -53,7 +53,6 @@ struct position_str {
svs_breakpoint_str stop;
//int pos; //the chromosomes are encoded over the positions.
std::map<std::string,read_str> support;
- //std::vector<read_str> support; // map?? -> no duplicated reads, easy to catch up which read is included.
int coverage;
int lowmq_cov;
int read_start;
@@ -129,6 +128,8 @@ public:
}
+ void integrate(Breakpoint * p);
+
int get_support();
long overlap(Breakpoint * tmp);
long overlap_breakpoint(long start,long stop);
@@ -186,6 +187,7 @@ public:
void set_valid(bool valid){
this-> should_be_stored=valid;
}
+
bool get_valid(){
return this->should_be_stored;
}
=====================================
src/sub/Detect_Breakpoints.cpp
=====================================
@@ -85,7 +85,7 @@ void detect_merged_svs(position_str point, RefVector ref, vector<Breakpoint *> &
}
}
if (stop_count > 1 || start_count > 1) {
- std::cout << "\tprocessing merged TRA" << std::endl;
+ // std::cout << "\tprocessing merged TRA" << std::endl;
if (start_count > 1) {
new_points.push_back(split_points(pos_start[0].names, point.support));
new_points.push_back(split_points(pos_start[1].names, point.support));
@@ -171,6 +171,47 @@ void polish_points(std::vector<Breakpoint *> & points, RefVector ref) { //TODO m
}
}
+std::string TRANS_type2(char type) {
+ string tmp;
+ if (type & DEL) {
+ tmp += "DEL";
+ }
+ if (type & INV) {
+ if (!tmp.empty()) {
+ tmp += '/';
+ }
+ tmp += "INV";
+ }
+ if (type & DUP) {
+ if (!tmp.empty()) {
+ tmp += '/';
+ }
+ tmp += "DUP";
+ }
+ if (type & INS) {
+ if (!tmp.empty()) {
+ tmp += '/';
+ }
+ tmp += "INS";
+ }
+ if (type & TRA) {
+ if (!tmp.empty()) {
+ tmp += '/';
+ }
+ tmp += "TRA";
+ }
+ if (type & NEST) {
+ if (!tmp.empty()) {
+ tmp += '/';
+ }
+ tmp += "NEST";
+ }
+ if (tmp.empty()) {
+ cout << "ERR?" << endl;
+ }
+ return tmp; // should not occur!
+}
+
void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
estimate_parameters(read_filename);
BamParser * mapped_file = 0;
@@ -180,7 +221,7 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
ref = mapped_file->get_refInfo();
} else {
cerr << "File Format not recognized. File must be a sorted .bam file!" << endl;
- exit(0);
+ exit(EXIT_FAILURE);
}
//Using PlaneSweep to comp coverage and iterate through reads:
//PlaneSweep * sweep = new PlaneSweep();
@@ -196,9 +237,10 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
//FILE * alt_allel_reads;
FILE * ref_allel_reads;
if (Parameter::Instance()->genotype) {
- ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "wb");
+ ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "w");
+ // ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "wb");
}
- Alignment * tmp_aln = mapped_file->parseRead(Parameter::Instance()->min_mq);
+ Alignment * tmp_aln = mapped_file->parseRead((uint16_t) Parameter::Instance()->min_mq);
long ref_space = get_ref_lengths(tmp_aln->getRefID(), ref);
long num_reads = 0;
@@ -210,9 +252,9 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
while (!tmp_aln->getQueryBases().empty()) {
- if ((tmp_aln->getAlignment()->IsPrimaryAlignment()) && (!(tmp_aln->getAlignment()->AlignmentFlag & 0x800) && tmp_aln->get_is_save())) { // && (Parameter::Instance()->chr_names.empty() || Parameter::Instance()->chr_names.find(ref[tmp_aln->getRefID()].RefName) != Parameter::Instance()->chr_names.end())) {
+ if ((tmp_aln->getAlignment()->IsPrimaryAlignment() && tmp_aln->get_is_save()) && !(tmp_aln->getAlignment()->AlignmentFlag & 0x800)){
- //change CHR:
+ //change CHR:
if (current_RefID != tmp_aln->getRefID()) {
std::cout << "\tSwitch Chr " << ref[tmp_aln->getRefID()].RefName << std::endl; //" " << ref[tmp_aln->getRefID()].RefLength
@@ -228,8 +270,26 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "wb");
}*/
+ std::cout << "\t\tReassessing breakpoints" << std::endl;
for (int i = 0; i < points.size(); i++) {
points[i]->calc_support();
+ }
+ int i = 0;
+ int pos = 0;
+ while (i < points.size()) {
+ if (points[i]->get_support() > min(3, Parameter::Instance()->min_support)) {
+ int dist = min(int(points[i]->get_coordinates().stop.most_support - points[i]->get_coordinates().start.most_support), Parameter::Instance()->max_dist);
+ if (pos != i && abs(points[i]->get_coordinates().start.most_support - points[pos]->get_coordinates().start.most_support) < dist / 2 && abs(points[i]->get_coordinates().stop.most_support - points[pos]->get_coordinates().stop.most_support) < dist / 2 && points[i]->get_SVtype() == points[pos]->get_SVtype()) {
+ // cout << "potential hit: " << TRANS_type2(points[i]->get_SVtype()) << " " << TRANS_type2(points[i]->get_SVtype()) << " " << points[i]->get_coordinates().start.most_support << " " << points[pos]->get_coordinates().start.most_support << " " << points[i]->get_coordinates().stop.most_support << " " << points[pos]->get_coordinates().stop.most_support << "supp: " << points[i]->get_support() << " " << points[pos]->get_support() << endl;
+ points[i]->integrate(points[pos]);
+ points[pos]->set_valid(false);
+ }
+ pos = i;
+ }
+ i++;
+ }
+
+ for (int i = 0; i < points.size(); i++) {
if (points[i]->get_valid()) {
//invoke update over ref support!
if (points[i]->get_SVtype() & TRA) {
@@ -239,16 +299,21 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
}
}
}
+ std::cout << "\t\tContinue parsing" << std::endl;
bst.clear(root);
current_RefID = tmp_aln->getRefID();
ref_space = get_ref_lengths(tmp_aln->getRefID(), ref);
}
+ if (strcmp(tmp_aln->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0) {
+ cout << "Found read! " << endl;
+ }
+
//SCAN read:
std::vector<str_event> aln_event;
std::vector<aln_str> split_events;
if (tmp_aln->getMappingQual() > Parameter::Instance()->min_mq) {
- double score = tmp_aln->get_scrore_ratio();
+ //double score = tmp_aln->get_scrore_ratio();
//
@@ -259,36 +324,35 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
#pragma omp section
{
// clock_t begin = clock();
- if ((score == -1 || score > Parameter::Instance()->score_treshold)) {
- aln_event = tmp_aln->get_events_Aln();
+ //if ((score == -1 || score > Parameter::Instance()->score_treshold)) {
+ aln_event = tmp_aln->get_events_Aln();
+ if (strcmp(tmp_aln->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0) {
+ cout << "Found read events: " << aln_event.size() << endl;
}
+
+ //}
// Parameter::Instance()->meassure_time(begin, " Alignment ");
}
#pragma omp section
{
// clock_t begin_split = clock();
split_events = tmp_aln->getSA(ref);
+ if (strcmp(tmp_aln->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0) {
+ cout << "Found read split events: " << split_events.size() << endl;
+ }
// Parameter::Instance()->meassure_time(begin_split, " Split reads ");
}
}
}
+
+ // bool flag = (strcmp(tmp_aln->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
+
//tmp_aln->set_supports_SV(aln_event.empty() && split_events.empty());
//Store reference supporting reads for genotype estimation:
-
- bool SV_support = (!aln_event.empty() && !split_events.empty());
- if (Parameter::Instance()->genotype && !SV_support) {
+ if (Parameter::Instance()->genotype && (aln_event.empty() && split_events.empty())) {
//write read:
- str_read tmp;
- tmp.chr_id = tmp_aln->getRefID(); //check string in binary???
- tmp.start = tmp_aln->getPosition();
- tmp.length = tmp_aln->getRefLength();
- if (tmp_aln->getStrand()) {
- tmp.strand = 1;
- } else {
- tmp.strand = 2;
- }
- fwrite(&tmp, sizeof(struct str_read), 1, ref_allel_reads);
+ write_read(tmp_aln, ref_allel_reads);
}
//store the potential SVs:
@@ -296,12 +360,15 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
add_events(tmp_aln, aln_event, 0, ref_space, bst, root, num_reads, false);
}
if (!split_events.empty()) {
+ if (strcmp(tmp_aln->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0) {
+ cout << "Found read split events: " << split_events.size() << endl;
+ }
add_splits(tmp_aln, split_events, 1, ref, bst, root, num_reads, false);
}
}
}
//get next read:
- mapped_file->parseReadFast(Parameter::Instance()->min_mq, tmp_aln);
+ mapped_file->parseReadFast((uint16_t) Parameter::Instance()->min_mq, tmp_aln);
num_reads++;
@@ -310,21 +377,30 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
}
}
//filter and copy results:
- std::cout << "Finalizing .." << std::endl;
+ std::cout << "\tFinalizing .." << std::endl;
+// cout << "SUMARIZE 2 " << endl;
std::vector<Breakpoint *> points;
bst.get_breakpoints(root, points);
- /* if (Parameter::Instance()->genotype) {
- fclose(ref_allel_reads);
- go->update_SVs(points, ref_space);
- string del = "rm ";
- del += Parameter::Instance()->tmp_genotyp;
- del += "ref_allele";
- system(del.c_str());
- }*/
-
for (int i = 0; i < points.size(); i++) {
points[i]->calc_support();
+ }
+ int i = 0;
+ int pos = 0;
+ while (i < points.size()) {
+ if (points[i]->get_support() > 3) {
+ int dist = min(int(points[i]->get_coordinates().stop.most_support - points[i]->get_coordinates().start.most_support), Parameter::Instance()->max_dist);
+ if (pos != i && abs(points[i]->get_coordinates().start.most_support - points[pos]->get_coordinates().start.most_support) < dist / 2 && abs(points[i]->get_coordinates().stop.most_support - points[pos]->get_coordinates().stop.most_support) < dist / 2 && points[i]->get_SVtype() == points[pos]->get_SVtype()) {
+ // cout << "potential hit: " << TRANS_type2(points[i]->get_SVtype()) << " " << TRANS_type2(points[i]->get_SVtype()) << " " << points[i]->get_coordinates().start.most_support << " " << points[pos]->get_coordinates().start.most_support << " " << points[i]->get_coordinates().stop.most_support << " " << points[pos]->get_coordinates().stop.most_support << "supp: " << points[i]->get_support() << " " << points[pos]->get_support() << endl;
+ points[i]->integrate(points[pos]);
+ points[pos]->set_valid(false);
+ }
+ pos = i;
+ }
+ i++;
+ }
+
+ for (int i = 0; i < points.size(); i++) {
if (points[i]->get_valid()) {
//invoke update over ref support!
if (points[i]->get_SVtype() & TRA) {
@@ -334,6 +410,7 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
}
}
}
+
bst.clear(root);
points.clear();
final.get_breakpoints(root_final, points);
@@ -359,92 +436,99 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
printer->printSV(points[i]);
}
}
- //std::cout<<"Done"<<std::endl;
+ //std::cout<<"Done"<<std::endl;/
+ if (Parameter::Instance()->genotype) {
+ fclose(ref_allel_reads);
+ }
+ delete mapped_file;
}
void add_events(Alignment *& tmp, std::vector<str_event> events, short type, long ref_space, IntervallTree & bst, TNode *&root, long read_id, bool add) {
bool flag = (strcmp(tmp->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
for (size_t i = 0; i < events.size(); i++) {
- // if (events[i - 1].length > Parameter::Instance()->min_segment_size || events[i].length > Parameter::Instance()->min_segment_size) {
- position_str svs;
- read_str read;
- if (events[i].is_noise) {
- read.type = 2;
- } else {
- read.type = 0;
- }
- read.SV = events[i].type;
- read.sequence = events[i].sequence;
+ // if (events[i - 1].length > Parameter::Instance()->min_segment_size || events[i].length > Parameter::Instance()->min_segment_size) {
+ position_str svs;
+ read_str read;
+ if (events[i].is_noise) {
+ read.type = 2;
+ } else {
+ read.type = 0;
+ }
+ read.SV = events[i].type;
+ read.sequence = events[i].sequence;
- if (flag) {
- std::cout << "ADD EVENT " << tmp->getName() << " " << tmp->getRefID() << " " << events[i].pos << " " << abs(events[i].length) << std::endl;
- }
- svs.start.min_pos = (long) events[i].pos + ref_space;
- svs.stop.max_pos = svs.start.min_pos + events[i].length;
+ if (flag) {
+ std::cout << "ADD EVENT " << tmp->getName() << " " << tmp->getRefID() << " " << events[i].pos << " " << events[i].pos + ref_space << " " << abs(events[i].length) << std::endl;
+ }
+ svs.start.min_pos = (long) events[i].pos + ref_space;
+ svs.stop.max_pos = svs.start.min_pos + events[i].length;
- if (tmp->getStrand()) {
- read.strand.first = (tmp->getStrand());
- read.strand.second = !(tmp->getStrand());
- } else {
- read.strand.first = !(tmp->getStrand());
- read.strand.second = (tmp->getStrand());
- }
- // start.support[0].read_start.min = events[i].read_pos;
+ if (tmp->getStrand()) {
+ read.strand.first = (tmp->getStrand());
+ read.strand.second = !(tmp->getStrand());
+ } else {
+ read.strand.first = !(tmp->getStrand());
+ read.strand.second = (tmp->getStrand());
+ }
+ // start.support[0].read_start.min = events[i].read_pos;
- read.read_strand.first = tmp->getStrand();
- read.read_strand.second = tmp->getStrand();
- if (flag) {
- std::cout << tmp->getName() << " " << tmp->getRefID() << " " << svs.start.min_pos << " " << svs.stop.max_pos << " " << svs.stop.max_pos - svs.start.min_pos << std::endl;
- }
+ read.read_strand.first = tmp->getStrand();
+ read.read_strand.second = tmp->getStrand();
+ if (flag) {
+ std::cout << tmp->getName() << " " << tmp->getRefID() << " " << svs.start.min_pos << " " << svs.stop.max_pos << " " << svs.stop.max_pos - svs.start.min_pos << std::endl;
+ }
- if (svs.start.min_pos > svs.stop.max_pos) {
- //can this actually happen?
- read.coordinates.first = svs.stop.max_pos;
- read.coordinates.second = svs.start.min_pos;
- } else {
- read.coordinates.first = svs.start.min_pos;
- read.coordinates.second = svs.stop.max_pos;
- }
+ if (svs.start.min_pos > svs.stop.max_pos) {
+ //can this actually happen?
+ read.coordinates.first = svs.stop.max_pos;
+ read.coordinates.second = svs.start.min_pos;
+ } else {
+ read.coordinates.first = svs.start.min_pos;
+ read.coordinates.second = svs.stop.max_pos;
+ }
- svs.start.max_pos = svs.start.min_pos;
- svs.stop.min_pos = svs.stop.max_pos;
+ svs.start.max_pos = svs.start.min_pos;
+ svs.stop.min_pos = svs.stop.max_pos;
- if (svs.start.min_pos > svs.stop.max_pos) { //incase they are inverted
- svs_breakpoint_str pos = svs.start;
- svs.start = svs.stop;
- svs.stop = pos;
- pair<bool, bool> tmp = read.strand;
- read.strand.first = tmp.second;
- read.strand.second = tmp.first;
- }
+ if (svs.start.min_pos > svs.stop.max_pos) { //incase they are inverted
+ svs_breakpoint_str pos = svs.start;
+ svs.start = svs.stop;
+ svs.stop = pos;
+ pair<bool, bool> tmp = read.strand;
+ read.strand.first = tmp.second;
+ read.strand.second = tmp.first;
+ }
- //TODO: we might not need this:
- if (svs.start.min_pos > svs.stop.max_pos) {
- read.coordinates.first = svs.stop.max_pos;
- read.coordinates.second = svs.start.min_pos;
- } else {
- read.coordinates.first = svs.start.min_pos;
- read.coordinates.second = svs.stop.max_pos;
- }
+ //TODO: we might not need this:
+ if (svs.start.min_pos > svs.stop.max_pos) {
+ read.coordinates.first = svs.stop.max_pos;
+ read.coordinates.second = svs.start.min_pos;
+ } else {
+ read.coordinates.first = svs.start.min_pos;
+ read.coordinates.second = svs.stop.max_pos;
+ }
- read.id = read_id;
- svs.support[tmp->getName()] = read;
- svs.support[tmp->getName()].length = events[i].length;
- Breakpoint * point = new Breakpoint(svs, events[i].length);
- if (add) {
- bst.insert_existant(point, root);
- } else {
- bst.insert(point, root);
- }
- //std::cout<<"Print:"<<std::endl;
- //bst.print(root);
+ // read.id = read_id;
+ svs.support[tmp->getName()] = read;
+ svs.support[tmp->getName()].length = events[i].length;
+ Breakpoint * point = new Breakpoint(svs, events[i].length);
+ if (add) {
+ bst.insert_existant(point, root);
+ } else {
+ bst.insert(point, root);
}
+ if (flag) {
+ cout << "DONE MERGE" << endl;
+ }
+ //std::cout<<"Print:"<<std::endl;
+ //bst.print(root);
+ }
// }
}
void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVector ref, IntervallTree& bst, TNode *&root, long read_id, bool add) {
- bool flag = (strcmp(tmp->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
+ bool flag = false; //(strcmp(tmp->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
if (flag) {
cout << "SPLIT: " << std::endl;
@@ -459,209 +543,211 @@ void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVe
}
for (size_t i = 1; i < events.size(); i++) {
- // if (events[i - 1].length > Parameter::Instance()->min_segment_size || events[i].length > Parameter::Instance()->min_segment_size) {
- position_str svs;
- //position_str stop;
- read_str read;
- read.sequence = "NA";
- //read.name = tmp->getName();
- read.type = type;
- read.SV = 0;
- read.read_strand.first = events[i - 1].strand;
- read.read_strand.second = events[i].strand;
-
- //stop.support.push_back(read);
- if (events[i].RefID == events[i - 1].RefID) { //IF different chr -> tra
- if (events[i - 1].strand == events[i].strand) { //IF same strand -> del/ins/dup
- if (events[i - 1].strand) {
- read.strand.first = events[i - 1].strand;
- read.strand.second = !events[i].strand;
- } else {
- read.strand.first = !events[i - 1].strand;
- read.strand.second = events[i].strand;
- }
- // int len1 = 0;
- //int len2 = 0;
- svs.read_start = events[i - 1].read_pos_stop; // (short) events[i - 1].read_pos_start + (short) events[i - 1].length;
- svs.read_stop = events[i].read_pos_start;
- if (events[i - 1].strand) {
- svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
- svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
- } else {
- svs.start.min_pos = events[i].pos + events[i].length + get_ref_lengths(events[i].RefID, ref);
- svs.stop.max_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
- }
+ // if (events[i - 1].length > Parameter::Instance()->min_segment_size || events[i].length > Parameter::Instance()->min_segment_size) {
+ position_str svs;
+ //position_str stop;
+ read_str read;
+ read.sequence = "NA";
+ //read.name = tmp->getName();
+ read.type = type;
+ read.SV = 0;
+ read.read_strand.first = events[i - 1].strand;
+ read.read_strand.second = events[i].strand;
+
+ //stop.support.push_back(read);
+ if (events[i].RefID == events[i - 1].RefID) { //IF different chr -> tra
+ if (events[i - 1].strand == events[i].strand) { //IF same strand -> del/ins/dup
+ if (events[i - 1].strand) {
+ read.strand.first = events[i - 1].strand;
+ read.strand.second = !events[i].strand;
+ } else {
+ read.strand.first = !events[i - 1].strand;
+ read.strand.second = events[i].strand;
+ }
+ // int len1 = 0;
+ //int len2 = 0;
+ svs.read_start = events[i - 1].read_pos_stop; // (short) events[i - 1].read_pos_start + (short) events[i - 1].length;
+ svs.read_stop = events[i].read_pos_start;
+ if (events[i - 1].strand) {
+ svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
+ svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
+ } else {
+ svs.start.min_pos = events[i].pos + events[i].length + get_ref_lengths(events[i].RefID, ref);
+ svs.stop.max_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
+ }
- if (flag) {
- cout << "Debug: SV_Size: " << (svs.start.min_pos - svs.stop.max_pos) << " tmp: " << (svs.stop.max_pos - svs.start.min_pos) << " Ref_start: " << svs.start.min_pos - get_ref_lengths(events[i].RefID, ref) << " Ref_stop: " << svs.stop.max_pos - get_ref_lengths(events[i].RefID, ref) << " readstart: " << svs.read_start << " readstop: " << svs.read_stop << std::endl;
- }
+ if (flag) {
+ cout << "Debug: SV_Size: " << (svs.start.min_pos - svs.stop.max_pos) << " tmp: " << (svs.stop.max_pos - svs.start.min_pos) << " Ref_start: " << svs.start.min_pos - get_ref_lengths(events[i].RefID, ref) << " Ref_stop: " << svs.stop.max_pos - get_ref_lengths(events[i].RefID, ref) << " readstart: " << svs.read_start << " readstop: " << svs.read_stop << std::endl;
+ }
- if ((svs.stop.max_pos - svs.start.min_pos) > Parameter::Instance()->min_length * -1 && ((svs.stop.max_pos - svs.start.min_pos) + (Parameter::Instance()->min_length) < (svs.read_stop - svs.read_start) && (svs.read_stop - svs.read_start) > (Parameter::Instance()->min_length * 2))) {
- if (!events[i].cross_N || (double) ((svs.stop.max_pos - svs.start.min_pos) + Parameter::Instance()->min_length) < ((double) (svs.read_stop - svs.read_start) * Parameter::Instance()->avg_ins)) {
- svs.stop.max_pos += (svs.read_stop - svs.read_start); //TODO check!
- if (Parameter::Instance()->print_seq) {
- svs.read_stop = events[i].read_pos_start;
- svs.read_start = events[i - 1].read_pos_stop;
- if (svs.read_stop > tmp->getAlignment()->QueryBases.size()) {
- cerr << "BUG: split read ins! " << svs.read_stop << " " << tmp->getAlignment()->QueryBases.size() << " " << tmp->getName() << endl;
- }
- if (!events[i - 1].strand) {
- std::string tmp_seq = reverse_complement(tmp->getAlignment()->QueryBases);
-
- read.sequence = reverse_complement(tmp_seq.substr(svs.read_start, svs.read_stop - svs.read_start));
- } else {
- read.sequence = tmp->getAlignment()->QueryBases.substr(svs.read_start, svs.read_stop - svs.read_start);
- }
- if (flag) {
- cout << "INS: " << endl;
- cout << "split read ins! " << events[i - 1].read_pos_stop << " " << events[i].read_pos_start << " " << " " << tmp->getAlignment()->QueryBases.size() << " " << tmp->getName() << endl;
- cout << "Seq+:" << read.sequence << endl;
- }
+ if ((svs.stop.max_pos - svs.start.min_pos) > Parameter::Instance()->min_length * -1 && ((svs.stop.max_pos - svs.start.min_pos) + (Parameter::Instance()->min_length) < (svs.read_stop - svs.read_start) && (svs.read_stop - svs.read_start) > (Parameter::Instance()->min_length * 2))) {
+ if (!events[i].cross_N || (double) ((svs.stop.max_pos - svs.start.min_pos) + Parameter::Instance()->min_length) < ((double) (svs.read_stop - svs.read_start) * Parameter::Instance()->avg_ins)) {
+ svs.stop.max_pos += (svs.read_stop - svs.read_start); //TODO check!
+ if (Parameter::Instance()->print_seq) {
+ svs.read_stop = events[i].read_pos_start;
+ svs.read_start = events[i - 1].read_pos_stop;
+
+
+ if (svs.read_stop > tmp->getAlignment()->QueryBases.size()) {
+ cerr << "BUG: split read ins! " << svs.read_stop << " " << tmp->getAlignment()->QueryBases.size() << " " << tmp->getName() << endl;
}
- read.SV |= INS;
- } else {
- read.SV |= 'n';
- }
+ if (!events[i - 1].strand) {
+ std::string tmp_seq = reverse_complement(tmp->getAlignment()->QueryBases);
- } else if ((svs.start.min_pos - svs.stop.max_pos) * -1 > (svs.read_stop - svs.read_start) + (Parameter::Instance()->min_length)) {
- if (!events[i].cross_N || (double) (svs.start.min_pos - svs.stop.max_pos) * Parameter::Instance()->avg_del * -1.0 > (double) ((svs.read_stop - svs.read_start) + (Parameter::Instance()->min_length))) {
- read.SV |= DEL;
+ read.sequence = reverse_complement(tmp_seq.substr(svs.read_start, svs.read_stop - svs.read_start));
+ } else {
+ read.sequence = tmp->getAlignment()->QueryBases.substr(svs.read_start, svs.read_stop - svs.read_start);
+ }
if (flag) {
- cout << "DEL2" << endl;
+ cout << "INS: " << endl;
+ cout << "split read ins! " << events[i - 1].read_pos_stop << " " << events[i].read_pos_start << " " << " " << tmp->getAlignment()->QueryBases.size() << " " << tmp->getName() << endl;
+ cout << "Seq+:" << read.sequence << endl;
}
- } else {
- read.SV |= 'n';
- }
-
- } else if ((svs.start.min_pos - svs.stop.max_pos) > Parameter::Instance()->min_length && (svs.read_start - svs.read_stop) < Parameter::Instance()->min_length) { //check with respect to the coords of reads!
- if (flag) {
- cout << "DUP: " << endl;
}
- read.SV |= DUP;
+ read.SV |= INS;
} else {
- if (flag) {
- cout << "N" << endl;
- }
- read.SV = 'n';
+ read.SV |= 'n';
}
- } else { // if first part of read is in a different direction as the second part-> INV
-
- read.strand.first = events[i - 1].strand;
- read.strand.second = !events[i].strand;
- bool is_overlapping = overlaps(events[i - 1], events[i]);
- if (is_overlapping && (events[i - 1].length > Parameter::Instance()->min_segment_size || events[i].length > Parameter::Instance()->min_segment_size)) {
+ } else if ((svs.start.min_pos - svs.stop.max_pos) * -1 > (svs.read_stop - svs.read_start) + (Parameter::Instance()->min_length)) {
+ if (!events[i].cross_N || (double) (svs.start.min_pos - svs.stop.max_pos) * Parameter::Instance()->avg_del * -1.0 > (double) ((svs.read_stop - svs.read_start) + (Parameter::Instance()->min_length))) {
+ read.SV |= DEL;
if (flag) {
- std::cout << "Overlap curr: " << events[i].pos << " " << events[i].pos + events[i].length << " prev: " << events[i - 1].pos << " " << events[i - 1].pos + events[i - 1].length << " " << tmp->getName() << std::endl;
- }
- read.SV |= NEST;
-
- if (events[i - 1].strand) {
- svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
- svs.stop.max_pos = (events[i].pos + events[i].length) + get_ref_lengths(events[i].RefID, ref);
- } else {
- svs.start.min_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
- svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
+ cout << "DEL2" << endl;
}
+ } else {
+ read.SV |= 'n';
+ }
- if (svs.start.min_pos > svs.stop.max_pos) {
- long tmp = svs.start.min_pos;
- svs.start.min_pos = svs.stop.max_pos;
- svs.stop.max_pos = tmp;
- }
- } else if (!is_overlapping) {
- read.SV |= INV;
- if (events[i - 1].strand) {
- svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
- svs.stop.max_pos = (events[i].pos + events[i].length) + get_ref_lengths(events[i].RefID, ref);
- } else {
- svs.start.min_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
- svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
- }
+ } else if ((svs.start.min_pos - svs.stop.max_pos) > Parameter::Instance()->min_length && (svs.read_start - svs.read_stop) < Parameter::Instance()->min_length) { //check with respect to the coords of reads!
+ if (flag) {
+ cout << "DUP: " << endl;
+ }
+ read.SV |= DUP;
+ } else {
+ if (flag) {
+ cout << "N" << endl;
}
+ read.SV = 'n';
}
+ } else { // if first part of read is in a different direction as the second part-> INV
- } else { //if not on the same chr-> TRA
read.strand.first = events[i - 1].strand;
- read.strand.second = !events[i].strand;
- if (events[i - 1].strand == events[i].strand) {
- //check this with + - strands!!
+ read.strand.second = !events[i].strand; //TODO think about this! potential not!
+
+ bool is_overlapping = overlaps(events[i - 1], events[i]);
+ if (is_overlapping && (events[i - 1].length > Parameter::Instance()->min_segment_size || events[i].length > Parameter::Instance()->min_segment_size)) {
+ if (flag) {
+ std::cout << "Overlap curr: " << events[i].pos << " " << events[i].pos + events[i].length << " prev: " << events[i - 1].pos << " " << events[i - 1].pos + events[i - 1].length << " " << tmp->getName() << std::endl;
+ }
+ read.SV |= NEST;
if (events[i - 1].strand) {
svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
- svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
+ svs.stop.max_pos = (events[i].pos + events[i].length) + get_ref_lengths(events[i].RefID, ref);
} else {
svs.start.min_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
- svs.stop.max_pos = events[i].pos + events[i].length + get_ref_lengths(events[i].RefID, ref);
+ svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
}
- } else {
+
+ if (svs.start.min_pos > svs.stop.max_pos) {
+ long tmp = svs.start.min_pos;
+ svs.start.min_pos = svs.stop.max_pos;
+ svs.stop.max_pos = tmp;
+ }
+ } else if (!is_overlapping) {
+ read.SV |= INV;
if (events[i - 1].strand) {
svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
- svs.stop.max_pos = events[i].pos + events[i].length + get_ref_lengths(events[i].RefID, ref);
+ svs.stop.max_pos = (events[i].pos + events[i].length) + get_ref_lengths(events[i].RefID, ref);
} else {
svs.start.min_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
}
}
- read.SV |= TRA;
}
- if (read.SV != 'n') {
- if (flag) {
- std::cout << "SPLIT: " << TRANS_type(read.SV) << " start: " << svs.start.min_pos - get_ref_lengths(events[i].RefID, ref) << " stop: " << svs.stop.max_pos - get_ref_lengths(events[i].RefID, ref);
- if (events[i - 1].strand) {
- std::cout << " +";
- } else {
- std::cout << " -";
- }
- if (events[i].strand) {
- std::cout << " +";
- } else {
- std::cout << " -";
- }
- std::cout << " " << tmp->getName() << std::endl;
- std::cout << "READ: " << svs.read_start << " " << svs.read_stop << " " << svs.read_start - svs.read_stop << std::endl;
- }
- //std::cout<<"split"<<std::endl;
-
- svs.start.max_pos = svs.start.min_pos;
- svs.stop.min_pos = svs.stop.max_pos;
- if (svs.start.min_pos > svs.stop.max_pos) {
- //maybe we have to invert the directions???
- svs_breakpoint_str pos = svs.start;
- svs.start = svs.stop;
- svs.stop = pos;
-
- pair<bool, bool> tmp = read.strand;
+ } else { //if not on the same chr-> TRA
+ read.strand.first = events[i - 1].strand;
+ read.strand.second = !events[i].strand;
+ if (events[i - 1].strand == events[i].strand) {
+ //check this with + - strands!!
- read.strand.first = tmp.second;
- read.strand.second = tmp.first;
+ if (events[i - 1].strand) {
+ svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
+ svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
+ } else {
+ svs.start.min_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
+ svs.stop.max_pos = events[i].pos + events[i].length + get_ref_lengths(events[i].RefID, ref);
}
-
- //TODO: we might not need this:
- if (svs.start.min_pos > svs.stop.max_pos) {
- read.coordinates.first = svs.stop.max_pos;
- read.coordinates.second = svs.start.min_pos;
+ } else {
+ if (events[i - 1].strand) {
+ svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
+ svs.stop.max_pos = events[i].pos + events[i].length + get_ref_lengths(events[i].RefID, ref);
} else {
- read.coordinates.first = svs.start.min_pos;
- read.coordinates.second = svs.stop.max_pos;
+ svs.start.min_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
+ svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
}
+ }
+ read.SV |= TRA;
+ }
- //pool out?
- read.id = read_id;
- svs.support[tmp->getName()] = read;
- svs.support[tmp->getName()].length = abs(read.coordinates.second - read.coordinates.first);
- Breakpoint * point = new Breakpoint(svs, abs(read.coordinates.second - read.coordinates.first));
- //std::cout<<"split ADD: " << <<" Name: "<<tmp->getName()<<" "<< svs.start.min_pos- get_ref_lengths(events[i].RefID, ref)<<"-"<<svs.stop.max_pos- get_ref_lengths(events[i].RefID, ref)<<std::endl;
- if (add) {
- bst.insert_existant(point, root);
+ if (read.SV != 'n') {
+ if (flag) {
+ std::cout << "SPLIT: " << TRANS_type(read.SV) << " start: " << svs.start.min_pos - get_ref_lengths(events[i].RefID, ref) << " stop: " << svs.stop.max_pos - get_ref_lengths(events[i].RefID, ref);
+ if (events[i - 1].strand) {
+ std::cout << " +";
+ } else {
+ std::cout << " -";
+ }
+ if (events[i].strand) {
+ std::cout << " +";
} else {
- bst.insert(point, root);
+ std::cout << " -";
}
- // std::cout<<"Print:"<<std::endl;
- // bst.print(root);
+ std::cout << " " << tmp->getName() << std::endl;
+ std::cout << "READ: " << svs.read_start << " " << svs.read_stop << " " << svs.read_start - svs.read_stop << std::endl;
}
+ //std::cout<<"split"<<std::endl;
+
+ svs.start.max_pos = svs.start.min_pos;
+ svs.stop.min_pos = svs.stop.max_pos;
+ if (svs.start.min_pos > svs.stop.max_pos) {
+ //maybe we have to invert the directions???
+ svs_breakpoint_str pos = svs.start;
+ svs.start = svs.stop;
+ svs.stop = pos;
+
+ pair<bool, bool> tmp = read.strand;
+
+ read.strand.first = tmp.second;
+ read.strand.second = tmp.first;
+ }
+
+ //TODO: we might not need this:
+ if (svs.start.min_pos > svs.stop.max_pos) {
+ read.coordinates.first = svs.stop.max_pos;
+ read.coordinates.second = svs.start.min_pos;
+ } else {
+ read.coordinates.first = svs.start.min_pos;
+ read.coordinates.second = svs.stop.max_pos;
+ }
+
+ //pool out?
+ read.id = read_id;
+ svs.support[tmp->getName()] = read;
+ svs.support[tmp->getName()].length = abs(read.coordinates.second - read.coordinates.first);
+ Breakpoint * point = new Breakpoint(svs, abs(read.coordinates.second - read.coordinates.first));
+ //std::cout<<"split ADD: " << <<" Name: "<<tmp->getName()<<" "<< svs.start.min_pos- get_ref_lengths(events[i].RefID, ref)<<"-"<<svs.stop.max_pos- get_ref_lengths(events[i].RefID, ref)<<std::endl;
+ if (add) {
+ bst.insert_existant(point, root);
+ } else {
+ bst.insert(point, root);
+ }
+ // std::cout<<"Print:"<<std::endl;
+ // bst.print(root);
}
+ }
//}
}
@@ -677,7 +763,7 @@ void estimate_parameters(std::string read_filename) {
ref = mapped_file->get_refInfo();
} else {
cerr << "File Format not recognized. File must be a sorted .bam file!" << endl;
- exit(0);
+ exit(EXIT_FAILURE);
}
Alignment * tmp_aln = mapped_file->parseRead(Parameter::Instance()->min_mq);
@@ -701,7 +787,7 @@ void estimate_parameters(std::string read_filename) {
double avg_del = 0;
double avg_ins = 0;
vector<int> tmp = tmp_aln->get_avg_diff(dist, avg_del, avg_ins);
- // std::cout<<"Debug:\t"<<avg_del<<" "<<avg_ins<<endl;
+ // std::cout<<"Debug:\t"<<avg_del<<" "<<avg_ins<<endl;
tot_avg_ins += avg_ins;
tot_avg_del += avg_del;
//
@@ -731,7 +817,7 @@ void estimate_parameters(std::string read_filename) {
}
if (num == 0) {
std::cerr << "Too few reads detected in " << Parameter::Instance()->bam_files[0] << std::endl;
- exit(1);
+ exit(EXIT_FAILURE);
}
vector<int> nums;
size_t pos = 0;
=====================================
src/sub/Detect_Breakpoints.h
=====================================
@@ -37,6 +37,7 @@ void estimate_parameters(std::string read_filename);
bool overlaps(aln_str prev,aln_str curr);
void detect_merged_svs(Breakpoint * point);
+long get_ref_lengths(int id, RefVector ref);
std::string TRANS_type(char type);
=====================================
src/tree/Breakpoint_Tree.cpp
=====================================
@@ -37,7 +37,7 @@ void Breakpoint_Tree::overalps(int start, int stop, std::string chr, breakpoint_
} else {
par->ref_support.second++;
}
- // std::cout<<"start: "<<start<<" "<<stop<<std::endl;
+ std::cout<<"Hit start: "<<start<<" "<<stop<<std::endl;
}
} else { //stop coordinate
if ((par->position > start + 100 && par->position < stop - 100) && strcmp(chr.c_str(), par->chr.c_str()) == 0) { //found
@@ -46,10 +46,12 @@ void Breakpoint_Tree::overalps(int start, int stop, std::string chr, breakpoint_
} else {
par->ref_support.second++;
}
- // std::cout<<"stop: "<< start<<" "<<stop<<std::endl;
+ std::cout<<"Hit stop: "<< start<<" "<<stop<<std::endl;
}
}
+// std::cout<<" No hit: "<< start<<" "<<par->position <<" "<<chr<<" "<<par->chr<<std::endl;
+
//search goes on:
if (start < par->position) {
overalps(start, stop, chr, par->left, ref_strand);
=====================================
src/tree/IntervallTree.cpp
=====================================
@@ -131,16 +131,19 @@ bool IntervallTree::overlaps(long start, long stop, TNode *p) {
} else {
long score = p->get_data()->overlap_breakpoint(start,stop);
if (score > 0) {
- overlaps(start,stop, p->left);
+ return overlaps(start,stop, p->left);
} else if (score < 0) {
- overlaps(start,stop, p->right);
+ return overlaps(start,stop, p->right);
} else {
return true;
}
}
+// return false;
}
+
+
// Finding the Smallest
-TNode * IntervallTree::findmin(TNode * p) {
+/*TNode * IntervallTree::findmin(TNode * p) {
if (p == NULL) {
std::cout << "The tree is empty\n" << std::endl;
return p;
@@ -164,7 +167,9 @@ TNode * IntervallTree::findmax(TNode * p) {
}
return p;
}
-}
+}*/
+
+
// Finding an get_value()
void IntervallTree::find(Breakpoint * point, TNode * &p) {
if (p == NULL) {
@@ -261,10 +266,12 @@ void IntervallTree::preorder(TNode * p) {
preorder(p->right);
}
}
+
+
void IntervallTree::get_breakpoints(TNode *p, std::vector<Breakpoint *> & points) {
if (p != NULL) {
get_breakpoints(p->right, points);
- //std::cout << "( " << p->get_data()->get_coordinates().start.min_pos << "-" << p->get_data()->get_coordinates().stop.max_pos << " "<< p->get_data()->get_coordinates().support.size()<<" )"<<std::endl;
+ // std::cout << "( " << p->get_data()->get_coordinates().start.min_pos << "-" << p->get_data()->get_coordinates().stop.max_pos << " "<< p->get_data()->get_coordinates().support.size()<<" "<< TRANS_type2((*p->get_data()->get_coordinates().support.begin()).second.SV)<<" )"<<std::endl;
points.push_back(p->get_data());
get_breakpoints(p->left, points);
}
=====================================
src/tree/TNode.h
=====================================
@@ -46,7 +46,7 @@ public:
}
Breakpoint * get_data() {
- return data;
+ return this->data;
}
int get_height() {
return height;
View it on GitLab: https://salsa.debian.org/med-team/sniffles/-/compare/b55f885cde60c9e2bc045ef48979ecff06a7b9b0...2e3595cf6e0bdc972fb214bb16612243ba1a36cc
--
View it on GitLab: https://salsa.debian.org/med-team/sniffles/-/compare/b55f885cde60c9e2bc045ef48979ecff06a7b9b0...2e3595cf6e0bdc972fb214bb16612243ba1a36cc
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200902/81d480d1/attachment-0001.html>
More information about the debian-med-commit
mailing list