[med-svn] [Git][med-team/sniffles][upstream] New upstream version 1.0.10+ds
Steffen Möller
gitlab at salsa.debian.org
Mon Oct 15 18:20:26 BST 2018
Steffen Möller pushed to branch upstream at Debian Med / sniffles
Commits:
7418012d by Steffen Moeller at 2018-10-15T17:10:17Z
New upstream version 1.0.10+ds
- - - - -
21 changed files:
- + .project
- CMakeLists.txt
- README.md
- src/Alignment.cpp
- src/ArgParseOutput.h
- src/Genotyper/Genotyper.cpp
- src/Genotyper/Genotyper.h
- src/Paramer.h
- src/Parser.h
- src/Sniffles.cpp
- src/force_calling/Force_calling.cpp
- src/force_calling/VCF_parser.cpp
- src/print/BedpePrinter.cpp
- src/print/VCFPrinter.cpp
- src/sub/Breakpoint.cpp
- src/sub/Detect_Breakpoints.cpp
- src/tree/Breakpoint_Tree.cpp
- src/tree/Breakpoint_Tree.h
- src/tree/IntervallTree.cpp
- test_set/README
- + test_set/test.vcf
Changes:
=====================================
.project
=====================================
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+ <name>Sniffles</name>
+ <comment></comment>
+ <projects>
+ </projects>
+ <buildSpec>
+ </buildSpec>
+ <natures>
+ </natures>
+</projectDescription>
=====================================
CMakeLists.txt
=====================================
@@ -7,9 +7,9 @@ option(STATIC "Build static binary" OFF)
set( SNIF_VERSION_MINOR 0 )
IF(CMAKE_BUILD_TYPE STREQUAL "Debug")
message(STATUS "Building in debug mode!")
- set( SNIF_VERSION_BUILD 8-debug )
+ set( SNIF_VERSION_BUILD 10-debug )
else()
- set( SNIF_VERSION_BUILD 8 )
+ set( SNIF_VERSION_BUILD 10 )
ENDIF()
=====================================
README.md
=====================================
@@ -4,6 +4,24 @@ Sniffles is a structural variation caller using third generation sequencing (Pac
Please see our github wiki for more information (https://github.com/fritzsedlazeck/Sniffles/wiki)
+
+# How to build Sniffles
+<pre>wget https://github.com/fritzsedlazeck/Sniffles/archive/master.tar.gz -O Sniffles.tar.gz
+tar xzvf Sniffles.tar.gz
+cd Sniffles-master/
+mkdir -p build/
+cd build/
+cmake ..
+make
+
+cd ../bin/sniffles*
+./sniffles</pre>
+
+Note Mac users often have to provide parameters to the cmake command:
+<pre>cmake -D CMAKE_C_COMPILER=/opt/local/bin/gcc-mp-4.7 -D CMAKE_CXX_COMPILER=/opt/local/bin/g++-mp-4.7 ..
+</pre>
+
+
**************************************
## NGMLR
Sniffles performs best with the mappings of NGMLR our novel long read mapping method.
@@ -12,8 +30,8 @@ https://github.com/philres/ngmlr
****************************************
## Citation:
-Please see and cite our preprint:
-http://www.biorxiv.org/content/early/2017/07/28/169557
+Please see and cite our paper:
+https://www.nature.com/articles/s41592-018-0001-7
**************************************
## Poster & Talks:
=====================================
src/Alignment.cpp
=====================================
@@ -75,7 +75,7 @@ vector<differences_str> Alignment::summarize_csstring(std::vector<indel_str> &de
vector<differences_str> events;
differences_str ev;
- std::cout<<"CS: "<<cs<<std::endl;
+ std::cout << "CS: " << cs << std::endl;
for (size_t i = 0; i < cs.size() && pos < max_size; i++) {
ev.position = pos;
if (cs[i] == ':') { //match (can be numbers or bp!)
@@ -88,7 +88,7 @@ vector<differences_str> Alignment::summarize_csstring(std::vector<indel_str> &de
num++;
}
}
- cout<<"\tMatch: "<<num<<std::endl;
+ cout << "\tMatch: " << num << std::endl;
pos += num;
} else if (cs[i] == '*') { //mismatch (ref,alt) pairs
//only every second char counts!
@@ -96,7 +96,7 @@ vector<differences_str> Alignment::summarize_csstring(std::vector<indel_str> &de
pos++;
i += 2;
ev.type = 0; //mismatch
- cout<<"\tMiss: "<<1<<std::endl;
+ cout << "\tMiss: " << 1 << std::endl;
} else if (cs[i] == '-') { //del
//collet del seq in dels!
indel_str del;
@@ -108,21 +108,21 @@ vector<differences_str> Alignment::summarize_csstring(std::vector<indel_str> &de
dels.push_back(del);
pos += del.sequence.size();
ev.type = del.sequence.size();
- cout<<"\tDEL: "<<del.sequence.size()<<std::endl;
+ cout << "\tDEL: " << del.sequence.size() << std::endl;
} else if (cs[i] == '+') { //ins
- int num=0;
+ int num = 0;
while ((i < cs.size() && cs[i] != '+') && (cs[i] != '-' && cs[i] != '+')) {
i++;
num++;
}
- ev.type =num*-1;
- cout<<"\tINS: "<<num<<std::endl;
+ ev.type = num * -1;
+ cout << "\tINS: " << num << std::endl;
}
events.push_back(ev);
}
- std::cout<<"end CS: "<<cs<<std::endl;
+ std::cout << "end CS: " << cs << std::endl;
return events;
}
@@ -136,6 +136,7 @@ vector<differences_str> Alignment::summarizeAlignment(std::vector<indel_str> &de
if (al->CigarData[0].Type == 'S') {
read_pos += al->CigarData[0].Length;
}
+
for (size_t i = 0; i < al->CigarData.size(); i++) {
if (al->CigarData[i].Type == 'M') {
pos += al->CigarData[i].Length;
@@ -176,15 +177,16 @@ vector<differences_str> Alignment::summarizeAlignment(std::vector<indel_str> &de
}
}
- /*if (flag) {
- std::cout << "FIRST:" << std::endl;
- for (size_t i = 0; i < events.size(); i++) {
- if (abs(events[i].type) > 200) {
- cout << events[i].position << " " << events[i].type << endl;
- }
- }
- cout << endl;
- }*/
+ //exit(0);
+ if (flag) {
+ std::cout << "FIRST:" << std::endl;
+ for (size_t i = 0; i < events.size(); i++) {
+ // if (abs(events[i].type) > 200) {
+ cout << events[i].position << " " << events[i].type << endl;
+ // }
+ }
+ cout << endl;
+ }
//set ref length requ. later on:
this->ref_len = pos - getPosition(); //TODO compare to get_length!
@@ -676,6 +678,7 @@ bool Alignment::overlapping_segments(vector<aln_str> entries) {
return (entries.size() == 2 && abs(entries[0].pos - entries[1].pos) < 100);
}
vector<aln_str> Alignment::getSA(RefVector ref) {
+
string sa;
vector<aln_str> entries;
if (al->GetTag("SA", sa) && !sa.empty()) {
@@ -690,7 +693,7 @@ vector<aln_str> Alignment::getSA(RefVector ref) {
uint32_t sv;
al->GetTag("SV", sv);
tmp.cross_N = ((sv & Ns_CLIPPED));
- bool flag = strcmp(getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0;
+ bool flag = strcmp(getName().c_str(),"0bac61ef-7819-462b-ae3d-32c68fe580c0")==0; //Parameter::Instance()->read_name.c_str()) == 0;
get_coords(tmp, tmp.read_pos_start, tmp.read_pos_stop);
if (flag) {
@@ -791,8 +794,7 @@ vector<aln_str> Alignment::getSA(RefVector ref) {
double Alignment::get_scrore_ratio() {
uint score = -1;
uint subscore = -1;
- if (al->GetTag("AS", score)) {
- al->GetTag("XS", subscore);
+ if (al->GetTag("AS", score) && al->GetTag("XS", subscore)) {
if (subscore == 0) {
subscore = 1;
}
@@ -804,6 +806,7 @@ bool Alignment::get_is_save() {
string sa;
double score = get_scrore_ratio(); //TODO should I use this again for bwa?
+// cout<<score<<endl;
return !((al->GetTag("XA", sa) && !sa.empty()) || (al->GetTag("XT", sa) && !sa.empty())) && (score == -1 || score > Parameter::Instance()->score_treshold); //|| //TODO: 7.5
}
@@ -921,8 +924,9 @@ std::string Alignment::get_md() {
std::string md;
if (al->GetTag("MD", md)) {
return md;
- }else{
- std::cerr<<"No MD string detected! Check bam file! Otherwise generate using e.g. samtools."<<std::endl;
+ } else {
+ std::cerr << "No MD string detected! Check bam file! Otherwise generate using e.g. samtools." << std::endl;
+ cout<<"MD: TEST" << this->getName()<<endl;
exit(0);
}
return md;
@@ -932,8 +936,8 @@ std::string Alignment::get_cs() {
std::string cs;
if (al->GetTag("cs", cs)) {
return cs;
- }else{
- std::cerr<<"No CS string detected! Check bam file!"<<std::endl;
+ } else {
+ std::cerr << "No CS string detected! Check bam file!" << std::endl;
exit(0);
}
return cs;
@@ -1056,6 +1060,8 @@ vector<int> Alignment::get_avg_diff(double & dist, double & avg_del, double & av
//computeAlignment();
//cout<<alignment.first<<endl;
//cout<<alignment.second<<endl;
+ avg_del = 0;
+ avg_ins = 0;
vector<int> mis_per_window;
std::vector<indel_str> dels;
vector<differences_str> event_aln = summarizeAlignment(dels);
@@ -1070,34 +1076,37 @@ vector<int> Alignment::get_avg_diff(double & dist, double & avg_del, double & av
double del = 0;
double ins = 0;
double mis = 0;
- double length = event_aln[event_aln.size() - 1].position - event_aln[0].position;
- for (size_t i = 0; i < event_aln.size(); i++) {
- if (i != 0) {
- dist += event_aln[i].position - event_aln[i - 1].position;
- }
+ if (event_aln.size() > 1) {
+ double length = event_aln[event_aln.size() - 1].position - event_aln[0].position;
+ for (size_t i = 0; i < event_aln.size(); i++) {
+ if (i != 0) {
+ dist += event_aln[i].position - event_aln[i - 1].position;
+ }
- pair_str tmp;
- tmp.position = -1;
- if (event_aln[i].type == 0) {
- tmp = plane->add_mut(event_aln[i].position, 1, min_tresh);
- } else {
- tmp = plane->add_mut(event_aln[i].position, abs(event_aln[i].type), min_tresh);
- }
- if (tmp.position != -1) { //check that its not the prev event!
- mis_per_window.push_back(tmp.coverage); //store #mismatch per window each time it exceeds. (which might be every event position!)
- }
- if (event_aln[i].type > 0) {
- avg_del += event_aln[i].type;
- } else if (event_aln[i].type < 0) {
- avg_ins += event_aln[i].type * -1;
+ pair_str tmp;
+ tmp.position = -1;
+ if (event_aln[i].type == 0) {
+ tmp = plane->add_mut(event_aln[i].position, 1, min_tresh);
+ } else {
+ tmp = plane->add_mut(event_aln[i].position, abs(event_aln[i].type), min_tresh);
+ }
+ if (tmp.position != -1) { //check that its not the prev event!
+ mis_per_window.push_back(tmp.coverage); //store #mismatch per window each time it exceeds. (which might be every event position!)
+ }
+ if (event_aln[i].type > 0) {
+ avg_del += event_aln[i].type;
+ } else if (event_aln[i].type < 0) {
+ avg_ins += event_aln[i].type * -1;
+ }
}
- }
+ //cout << "len: " << length << endl;
+ avg_ins = avg_ins / length;
+ avg_del = avg_del / length;
- avg_ins = avg_ins / length;
- avg_del = avg_del / length;
-
- dist = dist / (double) event_aln.size();
+ dist = dist / (double) event_aln.size();
+ }
plane->finalyze();
+
return mis_per_window; //total_num /num;
}
@@ -1108,12 +1117,12 @@ vector<str_event> Alignment::get_events_Aln() {
//clock_t comp_aln = clock();
std::vector<indel_str> dels;
vector<differences_str> event_aln;
- if (Parameter::Instance()->cs_string) {
- cout<<"run cs check "<<std::endl;
- event_aln = summarize_csstring(dels);
- } else {
- event_aln = summarizeAlignment(dels);
- }
+ /* if (Parameter::Instance()->cs_string) {
+ cout<<"run cs check "<<std::endl;
+ event_aln = summarize_csstring(dels);
+ } else {*/
+ event_aln = summarizeAlignment(dels);
+// }
//double time2 = Parameter::Instance()->meassure_time(comp_aln, "\tcompAln Events: ");
vector<str_event> events;
=====================================
src/ArgParseOutput.h
=====================================
@@ -11,8 +11,7 @@ using std::cerr;
using std::cout;
using std::endl;
-class ArgParseOutput : public TCLAP::StdOutput
-{
+class ArgParseOutput: public TCLAP::StdOutput {
private:
std::string usageStr;
@@ -36,6 +35,8 @@ public:
cerr << endl;
cerr << "Short usage:" << endl;
cerr << " sniffles [options] -m <sorted.bam> -v <output.vcf> " << endl;
+ cerr << "Version: " << Parameter::Instance()->version << std::endl;
+ cerr << "Contact: fritz.sedlazeck at gmail.com" << std::endl;
cerr << endl;
cerr << "For complete USAGE and HELP type:" << endl;
cerr << " sniffles --help" << endl;
=====================================
src/Genotyper/Genotyper.cpp
=====================================
@@ -13,25 +13,32 @@ std::string Genotyper::assess_genotype(int ref, int support) {
if (allele < Parameter::Instance()->min_allelel_frequency) {
return "";
}
+ if((support + ref)==0){
+ allele=0;
+ }
std::stringstream ss;
ss << ";AF=";
ss << allele;
ss << "\tGT:DR:DV\t";
- if (allele > Parameter::Instance()->homfreq) {
- ss << "1/1:";
+ if(ref==0 && support==0){
+ ss << "./."; //we cannot define it.
+ }else if (allele > Parameter::Instance()->homfreq) {
+ ss << "1/1";
} else if (allele > Parameter::Instance()->hetfreq) {
- ss << "0/1:";
+ ss << "0/1";
} else {
- ss << "0/0:";
+ ss << "0/0";
}
+ ss<<":";
ss << ref;
ss << ":";
ss << support;
return ss.str();
}
-std::string Genotyper::mod_breakpoint_vcf(string buffer, int ref) {
+std::string Genotyper::mod_breakpoint_vcf(string buffer, std::pair<int, int> ref_strand) {
+ int ref=ref_strand.first+ref_strand.second;
//find last of\t
//parse #reads supporting
//print #ref
@@ -41,6 +48,12 @@ std::string Genotyper::mod_breakpoint_vcf(string buffer, int ref) {
pos = buffer.find_last_of("GT");
//tab
entry = buffer.substr(0, pos - 2);
+ std::stringstream ss;
+ ss << ";REF_strand=";
+ ss << ref_strand.first;
+ ss << ",";
+ ss << ref_strand.second;
+ entry +=ss.str();
buffer = buffer.substr(pos + 1); // the right part is only needed:
pos = buffer.find_last_of(':');
@@ -54,8 +67,9 @@ std::string Genotyper::mod_breakpoint_vcf(string buffer, int ref) {
}
-std::string Genotyper::mod_breakpoint_bedpe(string buffer, int ref) {
+std::string Genotyper::mod_breakpoint_bedpe(string buffer, std::pair<int,int> ref_strand) {
+ int ref=ref_strand.first+ref_strand.second;
std::string tmp = buffer;
std::string entry = tmp;
entry += '\t';
@@ -198,11 +212,24 @@ void Genotyper::update_file(Breakpoint_Tree & tree, breakpoint_node *& node) {
tmp = get_breakpoint_bedpe(buffer);
}
- int ref = max(tree.get_ref(node, tmp.chr, tmp.pos), tree.get_ref(node, tmp.chr2, tmp.pos2));
+ std::pair<int,int> first_node=tree.get_ref(node, tmp.chr, tmp.pos);
+ std::pair<int,int> second_node=tree.get_ref(node, tmp.chr2, tmp.pos2);
+
+ std::pair<int,int> final_ref;
+ if(first_node.first+first_node.second>second_node.first+second_node.second){
+ final_ref=first_node;
+ }else{
+ final_ref=second_node;
+ }
+
+ if(final_ref.first==-1){
+ std::cerr<<"Error in GT: Tree node not found. Exiting."<<std::endl;
+ exit(0);
+ }
if (is_vcf) {
- to_print = mod_breakpoint_vcf(buffer, ref);
+ to_print = mod_breakpoint_vcf(buffer,final_ref);
} else {
- to_print = mod_breakpoint_bedpe(buffer, ref);
+ to_print = mod_breakpoint_bedpe(buffer,final_ref);
}
if (!to_print.empty()) {
fprintf(file, "%s", to_print.c_str());
@@ -266,7 +293,7 @@ std::vector<std::string> Genotyper::read_SVs(Breakpoint_Tree & tree, breakpoint_
tree.insert(node, tmp.chr, tmp.pos, true); //true: start;
tree.insert(node, tmp.chr2, tmp.pos2, false); //false: stop;//
num_sv++;
- if (num_sv % 1000 == 0) {
+ if (num_sv % 5000 == 0) {
cout << "\t\tRead in SV: " << num_sv << endl;
}
} else if (buffer[2] == 'c' && buffer[3] == 'o') { //##contig=<ID=chr1,length=699930>
@@ -300,7 +327,11 @@ void Genotyper::compute_cov(Breakpoint_Tree & tree, breakpoint_node *& node, std
cout << "\t\tScanning CHR " << ref_dict[tmp.chr_id] << endl;
prev_id = tmp.chr_id;
}
- tree.overalps(tmp.start, tmp.start + tmp.length, ref_dict[tmp.chr_id], node);
+ if (tmp.strand == 1) { //strand of read
+ tree.overalps(tmp.start, tmp.start + tmp.length, ref_dict[tmp.chr_id], node, true);
+ } else {
+ tree.overalps(tmp.start, tmp.start + tmp.length, ref_dict[tmp.chr_id], node, false);
+ }
nbytes = fread(&tmp, sizeof(struct str_read), 1, ref_allel_reads);
}
fclose(ref_allel_reads);
=====================================
src/Genotyper/Genotyper.h
=====================================
@@ -25,8 +25,8 @@ private:
void update_file(Breakpoint_Tree & tree,breakpoint_node *& node);
variant_str get_breakpoint_vcf(string buffer);
variant_str get_breakpoint_bedpe(string buffer);
- std::string mod_breakpoint_vcf(string buffer, int ref);
- std::string mod_breakpoint_bedpe(string buffer, int ref);
+ std::string mod_breakpoint_vcf(string buffer, std::pair<int,int> ref);
+ std::string mod_breakpoint_bedpe(string buffer, std::pair<int,int> ref);
void parse_pos(char * buffer, int & pos, std::string & chr);
=====================================
src/Paramer.h
=====================================
@@ -25,7 +25,7 @@ class Parameter {
private:
Parameter() {
window_thresh=10;//TODO check!
- version="1.0.8";
+ version="1.0.10";
huge_ins = 999;//TODO check??
}
~Parameter() {
@@ -84,6 +84,7 @@ public:
bool change_coords; //indicating for --Ivcf
bool skip_parameter_estimation;
bool cs_string;
+ bool read_strand;
void set_regions(std::string reg) {
size_t i = 0;
=====================================
src/Parser.h
=====================================
@@ -14,6 +14,7 @@ struct str_read{
uint chr_id;
uint start;
ushort length;
+ ushort strand; //1=forward, 2=backward;
};
class Parser {
=====================================
src/Sniffles.cpp
=====================================
@@ -2,16 +2,16 @@
// Name : Sniffles.cpp
// Author : Fritz Sedlazeck
// Version :
-// Copyright : Your copyright notice
-// Description : Hello World in C++, Ansi-style
+// Copyright : MIT License
+// Description : Detection of SVs for long read data.
//============================================================================
-// phil: cd ~/hetero/philipp/pacbio/example-svs/reads
//For mac: cmake -D CMAKE_C_COMPILER=/opt/local/bin/gcc-mp-4.7 -D CMAKE_CXX_COMPILER=/opt/local/bin/g++-mp-4.7 ..
#include <iostream>
#include "Paramer.h"
#include <tclap/CmdLine.h>
#include <unistd.h>
#include <omp.h>
+
#include "Genotyper/Genotyper.h"
#include "realign/Realign.h"
#include "sub/Detect_Breakpoints.h"
@@ -25,7 +25,9 @@
#include "ArgParseOutput.h"
#include "force_calling/Force_calling.h"
-//cmake -D CMAKE_C_COMPILER=/opt/local/bin/gcc-mp-4.7 -D CMAKE_CXX_COMPILER=/opt/local/bin/g++-mp-4.7 ..
+//cmake -D CMAKE_C_COMPILER=/usr/local/bin/gcc-8 -D CMAKE_CXX_COMPILER=/usr/local/bin/g++-8 ..
+
+
//TODO:
//check strand headers.
@@ -86,12 +88,12 @@ void read_parameters(int argc, char *argv[]) {
TCLAP::SwitchArg arg_genotype("", "genotype", "Enables Sniffles to compute the genotypes.", cmd, false);
TCLAP::SwitchArg arg_cluster("", "cluster", "Enables Sniffles to phase SVs that occur on the same reads", cmd, false);
TCLAP::SwitchArg arg_std("", "ignore_sd", "Ignores the sd based filtering. ", cmd, false);
- TCLAP::SwitchArg arg_bnd("", "report_BND", "Report BND instead of Tra in vcf output. ", cmd, false);
+ TCLAP::SwitchArg arg_bnd("", "report_BND", "Dont report BND instead use Tra in vcf output. ", cmd, true);
TCLAP::SwitchArg arg_seq("", "report_seq", "Report sequences for indels in vcf output. (Beta version!) ", cmd, false);
TCLAP::SwitchArg arg_coords("", "change_coords", "Adopt coordinates for force calling if finding evidence. ", cmd, false);
TCLAP::SwitchArg arg_parameter("", "skip_parameter_estimation", "Enables the scan if only very few reads are present. ", cmd, false);
TCLAP::SwitchArg arg_cs_string("", "cs_string", "Enables the scan of CS string instead of Cigar and MD. ", cmd, false);
-
+ TCLAP::SwitchArg arg_read_strand("", "report_read_strands", "Enables the report of the strand categories per read. (Beta) ", cmd, false);
TCLAP::ValueArg<float> arg_allelefreq("f", "allelefreq", "Threshold on allele frequency (0-1). ", false, 0.0, "float", cmd);
TCLAP::ValueArg<float> arg_hetfreq("", "min_het_af", "Threshold on allele frequency (0-1). ", false, 0.3, "float", cmd);
@@ -102,9 +104,11 @@ void read_parameters(int argc, char *argv[]) {
std::stringstream usage;
usage << "" << std::endl;
usage << "Usage: sniffles [options] -m <sorted.bam> -v <output.vcf> " << std::endl;
-
- usage << "" << std::endl;
+ usage << "Version: "<<Parameter::Instance()->version << std::endl;
+ usage << "Contact: fritz.sedlazeck at gmail.com" << std::endl;
+ usage << std::endl;
usage << "Input/Output:" << std::endl;
+
printParameter<std::string>(usage, arg_bamfile);
printParameter<std::string>(usage, arg_vcf);
printParameter<std::string>(usage, arg_bedpe);
@@ -138,6 +142,7 @@ void read_parameters(int argc, char *argv[]) {
printParameter(usage, arg_bnd);
printParameter(usage, arg_seq);
printParameter(usage, arg_std);
+ printParameter(usage,arg_read_strand);
usage << "" << std::endl;
usage << "Parameter estimation:" << std::endl;
@@ -174,7 +179,7 @@ void read_parameters(int argc, char *argv[]) {
Parameter::Instance()->change_coords = arg_coords.getValue();
Parameter::Instance()->debug = true;
Parameter::Instance()->score_treshold = 10;
- Parameter::Instance()->read_name = " "; //21_16296949_+";//21_40181680_-";//m151102_123142_42286_c100922632550000001823194205121665_s1_p0/80643/0_20394"; //"22_36746138"; //just for debuging reasons!
+ Parameter::Instance()->read_name = "0bac61ef-7819-462b-ae3d-32c68fe580c0"; //21_16296949_+";//21_40181680_-";//m151102_123142_42286_c100922632550000001823194205121665_s1_p0/80643/0_20394"; //"22_36746138"; //just for debuging reasons!
Parameter::Instance()->bam_files.push_back(arg_bamfile.getValue());
Parameter::Instance()->min_mq = arg_mq.getValue();
Parameter::Instance()->output_vcf = arg_vcf.getValue();
@@ -200,6 +205,7 @@ void read_parameters(int argc, char *argv[]) {
Parameter::Instance()->hetfreq = arg_hetfreq.getValue();
Parameter::Instance()->skip_parameter_estimation = arg_parameter.getValue();
Parameter::Instance()->cs_string = arg_cs_string.getValue();
+ Parameter::Instance()->read_strand=arg_read_strand.getValue();
if (Parameter::Instance()->skip_parameter_estimation) {
cout<<"\tSkip parameter estimation."<<endl;
=====================================
src/force_calling/Force_calling.cpp
=====================================
@@ -39,29 +39,33 @@ void fill_tree(IntervallTree & final, TNode *& root_final, RefVector ref, std::m
//parse VCF file
std::vector<strvcfentry> entries = parse_vcf(Parameter::Instance()->input_vcf, 0);
std::cout << "\t\t" << entries.size() << " SVs found in input." << std::endl;
- int invalid_svs=0;
+ int invalid_svs = 0;
for (size_t i = 0; i < entries.size(); i++) {
if (entries[i].type != -1) {
position_str svs;
- //cout<<"start: "<<entries[i].start.chr << " stop "<<entries[i].stop.chr<<endl;
+
+ if (ref_lens.find(entries[i].start.chr) == ref_lens.end()) { // check why this is not called!
+ cerr << "Error undefined CHR in VCF vs. BAM header: " << entries[i].start.chr << endl;
+ exit(0);
+ }
+ if (ref_lens.find(entries[i].stop.chr) == ref_lens.end()) {
+ cerr << "Error undefined CHR in VCF vs. BAM header: " << entries[i].stop.chr << endl;
+ exit(0);
+ }
svs.start.min_pos = (long) entries[i].start.pos + ref_lens[entries[i].start.chr];
svs.stop.max_pos = (long) entries[i].stop.pos + ref_lens[entries[i].stop.chr];
read_str read;
- if(ref_lens.find(entries[i].start.chr)==ref_lens.end()){
- cerr<<"Warning undefined CHR in VCF vs. BAM header: "<<entries[i].start.chr<<endl;
- }
- if(ref_lens.find(entries[i].stop.chr)==ref_lens.end()){
- cerr<<"Warning undefined CHR in VCF vs. BAM header: "<<entries[i].stop.chr<<endl;
- }
+
read.coordinates.first = (long) entries[i].start.pos + ref_lens[entries[i].start.chr];
read.coordinates.second = (long) entries[i].stop.pos + ref_lens[entries[i].stop.chr];
if (entries[i].type == 4) { //ins?
- if(entries[i].sv_len== Parameter::Instance()->huge_ins){
+ if (entries[i].sv_len == Parameter::Instance()->huge_ins) {
entries[i].sv_len++; // bad hack!
}
+
svs.stop.max_pos += (long) entries[i].sv_len;
- read.coordinates.second+=(long) entries[i].sv_len;
- // cout << "Parse: " << entries[i].start.pos << " " << entries[i].stop.pos << " " << svs.start.min_pos <<" "<<svs.stop.max_pos << endl;
+ read.coordinates.second += (long) entries[i].sv_len;
+ // cout << "Parse: " << entries[i].start.pos << " " << entries[i].stop.pos << " " << svs.start.min_pos <<" "<<svs.stop.max_pos << endl;
}
read.SV = assign_type(entries[i].type);
@@ -69,17 +73,18 @@ void fill_tree(IntervallTree & final, TNode *& root_final, RefVector ref, std::m
read.type = 2; //called
read.length = entries[i].sv_len; //svs.stop.max_pos-svs.start.min_pos;//try
svs.support["input"] = read;
+ // cout<<"Submit: "<<entries[i].type <<endl;
Breakpoint * br = new Breakpoint(svs, (long) entries[i].sv_len, read.SV);
final.insert(br, root_final);
} else {
invalid_svs++;
-
}
}
- cerr << "Invalid types found skipping " << invalid_svs <<" entries." << endl;
+ cerr << "\tInvalid types found skipping " << invalid_svs << " entries." << endl;
//std::cout << "Print:" << std::endl;
//final.print(root_final);
entries.clear();
+ //exit(0);
}
void force_calling(std::string bam_file, IPrinter *& printer) {
@@ -106,7 +111,7 @@ void force_calling(std::string bam_file, IPrinter *& printer) {
fill_tree(final, root_final, ref, ref_lens);
int current_RefID = 0;
- std::cout << "Start parsing: Chr "<<ref[current_RefID].RefName << std::endl;
+ std::cout << "Start parsing: Chr " << ref[current_RefID].RefName << std::endl;
//FILE * alt_allel_reads;
FILE * ref_allel_reads;
@@ -210,5 +215,5 @@ void force_calling(std::string bam_file, IPrinter *& printer) {
points[i]->predict_SV();
printer->printSV(points[i]); //redo! Ignore min support + STD etc.
}
- std::cout << "Fin!" << std::endl;
+
}
=====================================
src/force_calling/VCF_parser.cpp
=====================================
@@ -354,7 +354,7 @@ std::vector<strvcfentry> parse_vcf(std::string filename, int min_svs) {
if (count == 7 && strncmp(&buffer[i], ";STRANDS=", 9) == 0) {
set_strand = true;
tmp.strands.first = (bool) (buffer[i + 9] == '+');
- tmp.strands.second = (bool) (buffer[i + 10] == '+');
+ tmp.strands.second = (bool)(buffer[i + 10] == '+');
}
if (count == 9 && buffer[i - 1] == '\t') { //parsing genotype;
@@ -431,7 +431,6 @@ std::vector<strvcfentry> parse_vcf(std::string filename, int min_svs) {
}
if ((strcmp(tmp.start.chr.c_str(), tmp.stop.chr.c_str()) != 0 || (tmp.sv_len >= min_svs))) { // || tmp.type==4
-
if (tmp.type == 5) { //BND
if (strcmp(tmp.stop.chr.c_str(), tmp.start.chr.c_str()) == 0) {
tmp.type = 2;
=====================================
src/print/BedpePrinter.cpp
=====================================
@@ -8,7 +8,7 @@
#include "BedpePrinter.h"
void BedpePrinter::print_header() {
- fprintf(file, "%s", "#Chrom\tstart\tstop\tchrom2\tstart2\tstop2\tvariant_name/ID\tscore (smaller is better)\tstrand1\tstrand2\ttype\tnumber_of_split_reads\tbest_chr1\tbest_start\tbest_chr2\tbest_stop\tpredicted_length\tFILTER\n");
+ fprintf(file, "%s", "#Chrom\tstart\tstop\tchrom2\tstart2\tstop2\tvariant_name/ID\tscore (smaller is better)\tstrand1\tstrand2\ttype\tnumber_of_supporting_reads\tbest_chr1\tbest_start\tbest_chr2\tbest_stop\tpredicted_length\tFILTER\n");
}
void BedpePrinter::print_body(Breakpoint * &SV, RefVector ref) {
if (!this->bed_tree.is_in(SV->get_coordinates().start.most_support, this->root) && !this->bed_tree.is_in(SV->get_coordinates().stop.most_support, this->root)) {
@@ -103,8 +103,6 @@ void BedpePrinter::print_body(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%s", "\tIMPRECISE");
}
- //fprintf(file, "%c", '\t');
- //fprintf(file, "%i", SV->get_support());
fprintf(file, "%c", '\n');
}
}
=====================================
src/print/VCFPrinter.cpp
=====================================
@@ -8,7 +8,7 @@
#include "VCFPrinter.h"
void VCFPrinter::print_header() {
- fprintf(file, "%s", "##fileformat=VCFv4.2\n");
+ fprintf(file, "%s", "##fileformat=VCFv4.3\n");
fprintf(file, "%s", "##source=Sniffles\n");
string time = currentDateTime();
fprintf(file, "%s", "##fileDate=");
@@ -37,18 +37,28 @@ void VCFPrinter::print_header() {
fprintf(file, "%s", "##INFO=<ID=RE,Number=1,Type=Integer,Description=\"read support\">\n");
fprintf(file, "%s", "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variation\">\n");
fprintf(file, "%s", "##INFO=<ID=PRECISE,Number=0,Type=Flag,Description=\"Precise structural variation\">\n");
+ fprintf(file, "%s", "##INFO=<ID=UNRESOLVED,Number=0,Type=Flag,Description=\"An insertion that is longer than the read and thus we cannot predict the full size.\">\n");
fprintf(file, "%s", "##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Length of the SV\">\n");
+ fprintf(file, "%s", "##INFO=<ID=REF_strand,Number=2,Type=Integer,Description=\"Length of the SV\">\n");
fprintf(file, "%s", "##INFO=<ID=SVMETHOD,Number=1,Type=String,Description=\"Type of approach used to detect SV\">\n");
fprintf(file, "%s", "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\n");
if (Parameter::Instance()->report_n_reads > 0 || Parameter::Instance()->report_n_reads == -1) {
- fprintf(file, "%s", "##INFO=<ID=RNAMES,Number=1,Type=String,Description=\"Names of reads supporting SVs (comma seperated)\">\n");
+ fprintf(file, "%s", "##INFO=<ID=RNAMES,Number=1,Type=String,Description=\"Names of reads supporting SVs (comma separated)\">\n");
}
+ if (Parameter::Instance()->print_seq) {
+ fprintf(file, "%s", "##INFO=<ID=SEQ,Number=1,Type=String,Description=\"Extracted sequence from the best representative read.\">\n");
+ }
+
+ if (Parameter::Instance()->read_strand) {
+ fprintf(file, "%s", "##INFO=<ID=STRANDS2,Number=4,Type=Integer,Description=\"alt reads first + ,alt reads first -,alt reads second + ,alt reads second -.\">\n");
+ fprintf(file, "%s", "##INFO=<ID=REF_strand,Number=2,Type=Integer,Description=\"plus strand ref, minus strand ref.\">\n");
+ }
+
fprintf(file, "%s", "##INFO=<ID=STD_quant_start,Number=A,Type=Integer,Description=\"STD of the start breakpoints across the reads.\">\n");
fprintf(file, "%s", "##INFO=<ID=STD_quant_stop,Number=A,Type=Integer,Description=\"STD of the stop breakpoints across the reads.\">\n");
- fprintf(file, "%s", "##INFO=<ID=Kurtosis_quant_start,Number=A,Type=Integer,Description=\"Kurtosis value of the start breakpoints accross the reads.\">\n");
- fprintf(file, "%s", "##INFO=<ID=Kurtosis_quant_stop,Number=A,Type=Integer,Description=\"Kurtosis value of the stop breakpoints accross the reads.\">\n");
- fprintf(file, "%s", "##INFO=<ID=SUPTYPE,Number=1,Type=String,Description=\"Type by which the variant is supported.(SR,ALN)\">\n");
- fprintf(file, "%s", "##INFO=<ID=SUPTYPE,Number=1,Type=String,Description=\"Type by which the variant is supported.(SR,ALN)\">\n");
+ fprintf(file, "%s", "##INFO=<ID=Kurtosis_quant_start,Number=A,Type=Integer,Description=\"Kurtosis value of the start breakpoints across the reads.\">\n");
+ fprintf(file, "%s", "##INFO=<ID=Kurtosis_quant_stop,Number=A,Type=Integer,Description=\"Kurtosis value of the stop breakpoints across the reads.\">\n");
+ fprintf(file, "%s", "##INFO=<ID=SUPTYPE,Number=1,Type=String,Description=\"Type by which the variant is supported.(SR,ALN,NR)\">\n");
fprintf(file, "%s", "##INFO=<ID=STRANDS,Number=A,Type=String,Description=\"Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)\">\n");
fprintf(file, "%s", "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele Frequency.\">\n");
fprintf(file, "%s", "##INFO=<ID=ZMW,Number=A,Type=Integer,Description=\"Number of ZMWs (Pacbio) supporting SV.\">\n");
@@ -89,9 +99,9 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%i", id);
id++;
- long end_coord=SV->get_coordinates().stop.most_support;
+ long end_coord = SV->get_coordinates().stop.most_support;
if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {
- end_coord=std::max((SV->get_coordinates().stop.most_support - (long)SV->get_length()),(long) start);
+ end_coord = std::max((SV->get_coordinates().stop.most_support - (long) SV->get_length()), (long) start);
}
int end = IPrinter::calc_pos(end_coord, ref, chr);
@@ -123,11 +133,10 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {
fprintf(file, "%s", "\t.\tUNRESOLVED\t");
- }else{
+ } else {
fprintf(file, "%s", "\t.\tPASS\t");
}
-
if (std_quant.first < 10 && std_quant.second < 10) {
fprintf(file, "%s", "PRECISE");
} else {
@@ -144,7 +153,7 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
//if (SV->get_SVtype() & INS) {
// fprintf(file, "%i", std::max((int) (end - SV->get_length()), start));
//} else {
- fprintf(file, "%i", end);
+ fprintf(file, "%i", end);
//}
}
if (zmws != 0) {
@@ -175,15 +184,49 @@ void VCFPrinter::print_body(Breakpoint * &SV, RefVector ref) {
fprintf(file, "%s", SV->get_supporting_types().c_str());
fprintf(file, "%s", ";SVLEN=");
- if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) {//!
+ if (((SV->get_SVtype() & INS) && SV->get_length() == Parameter::Instance()->huge_ins) && SV->get_types().is_ALN) { //!
fprintf(file, "%i", 999999999);
- } else {
+ }else if (SV->get_SVtype() & TRA){
+ fprintf(file, "%i",0);
+ }else if (SV->get_SVtype() & DEL){
+ fprintf(file, "%i", SV->get_length()*-1);
+ }else {
fprintf(file, "%i", SV->get_length());
}
// }
fprintf(file, "%s", ";STRANDS=");
fprintf(file, "%s", strands.c_str());
- if (!SV->get_sequence().empty()) {
+ if (Parameter::Instance()->read_strand) {
+ fprintf(file, "%s", ";STRANDS2=");
+ std::map<std::string, read_str> support = SV->get_coordinates().support;
+ pair<int, int> tmp_start;
+ pair<int, int> tmp_stop;
+ tmp_start.first = 0;
+ tmp_start.second = 0;
+ tmp_stop.first = 0;
+ tmp_stop.second = 0;
+ for (std::map<std::string, read_str>::iterator i = support.begin(); i != support.end(); i++) {
+ if ((*i).second.read_strand.first) {
+ tmp_start.first++;
+ } else {
+ tmp_start.second++;
+ }
+ if ((*i).second.read_strand.second) {
+ tmp_stop.first++;
+ } else {
+ tmp_stop.second++;
+ }
+ }
+ fprintf(file, "%i", tmp_start.first);
+ fprintf(file, "%s", ",");
+ fprintf(file, "%i", tmp_start.second);
+ fprintf(file, "%s", ",");
+ fprintf(file, "%i", tmp_stop.first);
+ fprintf(file, "%s", ",");
+ fprintf(file, "%i", tmp_stop.second);
+ }
+
+ if (Parameter::Instance()->print_seq && !SV->get_sequence().empty()) {
fprintf(file, "%s", ";SEQ=");
fprintf(file, "%s", SV->get_sequence().c_str());
}
@@ -204,6 +247,13 @@ void VCFPrinter::print_body_recall(Breakpoint * &SV, RefVector ref) {
if (Parameter::Instance()->phase) {
store_readnames(SV->get_read_ids(), id);
}
+
+ pair<double, double> kurtosis;
+ pair<double, double> std_quant;
+ double std_length = 0;
+ int zmws = 0;
+ bool ok_to_print = to_print(SV, std_quant, kurtosis, std_length, zmws);
+
std::string chr;
int start = IPrinter::calc_pos(SV->get_coordinates().start.most_support, ref, chr);
fprintf(file, "%s", chr.c_str());
=====================================
src/sub/Breakpoint.cpp
=====================================
@@ -358,9 +358,9 @@ void Breakpoint::predict_SV() {
std::vector<long> stops2;
std::vector<long> lengths2;
- bool scan_reads=true;
- if(positions.support.find("input") != positions.support.end() && !Parameter::Instance()->change_coords){
- scan_reads=false;
+ bool scan_reads = true;
+ if (positions.support.find("input") != positions.support.end() && !Parameter::Instance()->change_coords) {
+ scan_reads = false;
}
for (std::map<std::string, read_str>::iterator i = positions.support.begin(); i != positions.support.end() && scan_reads; i++) {
@@ -441,7 +441,6 @@ void Breakpoint::predict_SV() {
this->indel_sequence = "";
//find the stop coordinate:
-
maxim = 0;
coord = 0;
mean = 0;
@@ -457,6 +456,7 @@ void Breakpoint::predict_SV() {
} else {
this->positions.stop.most_support = coord;
}
+
//find the length:
if (!(this->get_SVtype() & INS)) { //all types but Insertions:
this->length = this->positions.stop.most_support - this->positions.start.most_support;
@@ -479,10 +479,11 @@ void Breakpoint::predict_SV() {
} else {
this->length = coord;
}
- // cout << "third len: " << this->length << endl; // problem!
+ // cout << "third len: " << this->length << endl; // problem!
}
starts.clear();
stops.clear();
+ //strand information:
for (size_t i = 0; i < strands.size(); i++) {
maxim = 0;
std::string id;
@@ -499,19 +500,40 @@ void Breakpoint::predict_SV() {
}
}
strands.clear();
+ }
- std::map<std::string, read_str>::iterator tmp = positions.support.begin();
- int start_prev_dist = 1000;
- int stop_prev_dist = 1000;
- while (tmp != positions.support.end()) {
- int start_dist = abs((*tmp).second.coordinates.first - this->positions.start.most_support);
- int stop_dist = abs((*tmp).second.coordinates.second - this->positions.stop.most_support);
- if (((*tmp).second.SV & this->sv_type) && ((start_dist < start_prev_dist && stop_dist < stop_prev_dist) && (strcmp((*tmp).second.sequence.c_str(), "NA") != 0 && !(*tmp).second.sequence.empty()))) {
- this->indel_sequence = (*tmp).second.sequence;
- }
- tmp++;
+ if (!scan_reads) {
+ // std::cout<<"Test strand"<<std::endl;
+ //if forcecalling we need to define the strands:
+ std::map<std::string, read_str>::iterator it = positions.support.find("input");
+
+ //no output!??
+ /*if ((*it).second.strand.first) {
+ std::cout << "s1: true" << endl;
+ }
+ if ((*it).second.strand.second) {
+ std::cout << "s2: true" << endl;
+ }*/
+ if (it != positions.support.end()) {
+ this->strand.push_back(translate_strand((*it).second.strand));
}
}
+ std::map<std::string, read_str>::iterator tmp = positions.support.begin();
+ int start_prev_dist = 1000;
+ int stop_prev_dist = 1000;
+
+ while (tmp != positions.support.end()) {
+ int start_dist = abs((*tmp).second.coordinates.first - this->positions.start.most_support);
+ int stop_dist = abs((*tmp).second.coordinates.second - this->positions.stop.most_support);
+ if (((*tmp).second.SV & this->sv_type) && ((start_dist < start_prev_dist && stop_dist < stop_prev_dist) && (strcmp((*tmp).second.sequence.c_str(), "NA") != 0 && !(*tmp).second.sequence.empty()))) {
+ this->indel_sequence = (*tmp).second.sequence;
+
+ stop_prev_dist = stop_dist; //new to test!
+ start_prev_dist = start_dist;
+ }
+ tmp++;
+ }
+ //}
this->supporting_types = "";
if (positions.support.find("input") != positions.support.end()) {
if (num == 0) {
=====================================
src/sub/Detect_Breakpoints.cpp
=====================================
@@ -203,14 +203,14 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
long num_reads = 0;
/*Genotyper * go;
- if (Parameter::Instance()->genotype) {
- go = new Genotyper();
- }*/
- std::cout << "Start parsing... "<<ref[tmp_aln->getRefID()].RefName << std::endl;
+ if (Parameter::Instance()->genotype) {
+ go = new Genotyper();
+ }*/
+ std::cout << "Start parsing... " << ref[tmp_aln->getRefID()].RefName << std::endl;
while (!tmp_aln->getQueryBases().empty()) {
- if ((tmp_aln->getAlignment()->IsPrimaryAlignment()) && (!(tmp_aln->getAlignment()->AlignmentFlag & 0x800) && tmp_aln->get_is_save())){// && (Parameter::Instance()->chr_names.empty() || Parameter::Instance()->chr_names.find(ref[tmp_aln->getRefID()].RefName) != Parameter::Instance()->chr_names.end())) {
+ if ((tmp_aln->getAlignment()->IsPrimaryAlignment()) && (!(tmp_aln->getAlignment()->AlignmentFlag & 0x800) && tmp_aln->get_is_save())) { // && (Parameter::Instance()->chr_names.empty() || Parameter::Instance()->chr_names.find(ref[tmp_aln->getRefID()].RefName) != Parameter::Instance()->chr_names.end())) {
//change CHR:
if (current_RefID != tmp_aln->getRefID()) {
@@ -220,13 +220,13 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
bst.get_breakpoints(root, points);
//polish_points(points, ref);
- /* if (Parameter::Instance()->genotype) {
- fclose(ref_allel_reads);
- cout<<"\t\tGenotyping"<<endl;
- go->update_SVs(points, ref_space);
- cout<<"\t\tGenotyping finished"<<endl;
- ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "wb");
- }*/
+ /* if (Parameter::Instance()->genotype) {
+ fclose(ref_allel_reads);
+ cout<<"\t\tGenotyping"<<endl;
+ go->update_SVs(points, ref_space);
+ cout<<"\t\tGenotyping finished"<<endl;
+ ref_allel_reads = fopen(Parameter::Instance()->tmp_genotyp.c_str(), "wb");
+ }*/
for (int i = 0; i < points.size(); i++) {
points[i]->calc_support();
@@ -283,6 +283,11 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
tmp.chr_id = tmp_aln->getRefID(); //check string in binary???
tmp.start = tmp_aln->getPosition();
tmp.length = tmp_aln->getRefLength();
+ if (tmp_aln->getStrand()) {
+ tmp.strand = 1;
+ } else {
+ tmp.strand = 2;
+ }
fwrite(&tmp, sizeof(struct str_read), 1, ref_allel_reads);
}
@@ -310,13 +315,13 @@ void detect_breakpoints(std::string read_filename, IPrinter *& printer) {
bst.get_breakpoints(root, points);
/* if (Parameter::Instance()->genotype) {
- fclose(ref_allel_reads);
- go->update_SVs(points, ref_space);
- string del = "rm ";
- del += Parameter::Instance()->tmp_genotyp;
- del += "ref_allele";
- system(del.c_str());
- }*/
+ fclose(ref_allel_reads);
+ go->update_SVs(points, ref_space);
+ string del = "rm ";
+ del += Parameter::Instance()->tmp_genotyp;
+ del += "ref_allele";
+ system(del.c_str());
+ }*/
for (int i = 0; i < points.size(); i++) {
points[i]->calc_support();
@@ -361,79 +366,81 @@ void add_events(Alignment *& tmp, std::vector<str_event> events, short type, lon
bool flag = (strcmp(tmp->getName().c_str(), Parameter::Instance()->read_name.c_str()) == 0);
for (size_t i = 0; i < events.size(); i++) {
- position_str svs;
- read_str read;
- if (events[i].is_noise) {
- read.type = 2;
- } else {
- read.type = 0;
- }
- read.SV = events[i].type;
- read.sequence = events[i].sequence;
+ // if (events[i - 1].length > Parameter::Instance()->min_segment_size || events[i].length > Parameter::Instance()->min_segment_size) {
+ position_str svs;
+ read_str read;
+ if (events[i].is_noise) {
+ read.type = 2;
+ } else {
+ read.type = 0;
+ }
+ read.SV = events[i].type;
+ read.sequence = events[i].sequence;
- if (flag) {
- std::cout << "ADD EVENT " << tmp->getName() << " " << tmp->getRefID() << " " << events[i].pos << " " << abs(events[i].length) << std::endl;
- }
- svs.start.min_pos = (long) events[i].pos + ref_space;
- svs.stop.max_pos = svs.start.min_pos + events[i].length;
+ if (flag) {
+ std::cout << "ADD EVENT " << tmp->getName() << " " << tmp->getRefID() << " " << events[i].pos << " " << abs(events[i].length) << std::endl;
+ }
+ svs.start.min_pos = (long) events[i].pos + ref_space;
+ svs.stop.max_pos = svs.start.min_pos + events[i].length;
- if (tmp->getStrand()) {
- read.strand.first = (tmp->getStrand());
- read.strand.second = !(tmp->getStrand());
- } else {
- read.strand.first = !(tmp->getStrand());
- read.strand.second = (tmp->getStrand());
- }
- // start.support[0].read_start.min = events[i].read_pos;
+ if (tmp->getStrand()) {
+ read.strand.first = (tmp->getStrand());
+ read.strand.second = !(tmp->getStrand());
+ } else {
+ read.strand.first = !(tmp->getStrand());
+ read.strand.second = (tmp->getStrand());
+ }
+ // start.support[0].read_start.min = events[i].read_pos;
- read.read_strand.first = tmp->getStrand();
- read.read_strand.second = tmp->getStrand();
- if (flag) {
- std::cout << tmp->getName() << " " << tmp->getRefID() << " " << svs.start.min_pos << " " << svs.stop.max_pos << " " << svs.stop.max_pos - svs.start.min_pos << std::endl;
- }
+ read.read_strand.first = tmp->getStrand();
+ read.read_strand.second = tmp->getStrand();
+ if (flag) {
+ std::cout << tmp->getName() << " " << tmp->getRefID() << " " << svs.start.min_pos << " " << svs.stop.max_pos << " " << svs.stop.max_pos - svs.start.min_pos << std::endl;
+ }
- if (svs.start.min_pos > svs.stop.max_pos) {
- //can this actually happen?
- read.coordinates.first = svs.stop.max_pos;
- read.coordinates.second = svs.start.min_pos;
- } else {
- read.coordinates.first = svs.start.min_pos;
- read.coordinates.second = svs.stop.max_pos;
- }
+ if (svs.start.min_pos > svs.stop.max_pos) {
+ //can this actually happen?
+ read.coordinates.first = svs.stop.max_pos;
+ read.coordinates.second = svs.start.min_pos;
+ } else {
+ read.coordinates.first = svs.start.min_pos;
+ read.coordinates.second = svs.stop.max_pos;
+ }
- svs.start.max_pos = svs.start.min_pos;
- svs.stop.min_pos = svs.stop.max_pos;
+ svs.start.max_pos = svs.start.min_pos;
+ svs.stop.min_pos = svs.stop.max_pos;
- if (svs.start.min_pos > svs.stop.max_pos) { //incase they are inverted
- svs_breakpoint_str pos = svs.start;
- svs.start = svs.stop;
- svs.stop = pos;
- pair<bool, bool> tmp = read.strand;
- read.strand.first = tmp.second;
- read.strand.second = tmp.first;
- }
+ if (svs.start.min_pos > svs.stop.max_pos) { //incase they are inverted
+ svs_breakpoint_str pos = svs.start;
+ svs.start = svs.stop;
+ svs.stop = pos;
+ pair<bool, bool> tmp = read.strand;
+ read.strand.first = tmp.second;
+ read.strand.second = tmp.first;
+ }
- //TODO: we might not need this:
- if (svs.start.min_pos > svs.stop.max_pos) {
- read.coordinates.first = svs.stop.max_pos;
- read.coordinates.second = svs.start.min_pos;
- } else {
- read.coordinates.first = svs.start.min_pos;
- read.coordinates.second = svs.stop.max_pos;
- }
+ //TODO: we might not need this:
+ if (svs.start.min_pos > svs.stop.max_pos) {
+ read.coordinates.first = svs.stop.max_pos;
+ read.coordinates.second = svs.start.min_pos;
+ } else {
+ read.coordinates.first = svs.start.min_pos;
+ read.coordinates.second = svs.stop.max_pos;
+ }
- read.id = read_id;
- svs.support[tmp->getName()] = read;
- svs.support[tmp->getName()].length = events[i].length;
- Breakpoint * point = new Breakpoint(svs, events[i].length);
- if (add) {
- bst.insert_existant(point, root);
- } else {
- bst.insert(point, root);
+ read.id = read_id;
+ svs.support[tmp->getName()] = read;
+ svs.support[tmp->getName()].length = events[i].length;
+ Breakpoint * point = new Breakpoint(svs, events[i].length);
+ if (add) {
+ bst.insert_existant(point, root);
+ } else {
+ bst.insert(point, root);
+ }
+ //std::cout<<"Print:"<<std::endl;
+ //bst.print(root);
}
- //std::cout<<"Print:"<<std::endl;
- //bst.print(root);
- }
+// }
}
void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVector ref, IntervallTree& bst, TNode *&root, long read_id, bool add) {
@@ -452,212 +459,214 @@ void add_splits(Alignment *& tmp, std::vector<aln_str> events, short type, RefVe
}
for (size_t i = 1; i < events.size(); i++) {
- position_str svs;
- //position_str stop;
- read_str read;
- read.sequence = "NA";
- //read.name = tmp->getName();
- read.type = type;
- read.SV = 0;
- read.read_strand.first = events[i - 1].strand;
- read.read_strand.second = events[i].strand;
-
- //stop.support.push_back(read);
- if (events[i].RefID == events[i - 1].RefID) { //IF different chr -> tra
- if (events[i - 1].strand == events[i].strand) { //IF same strand -> del/ins/dup
- if (events[i - 1].strand) {
- read.strand.first = events[i - 1].strand;
- read.strand.second = !events[i].strand;
- } else {
- read.strand.first = !events[i - 1].strand;
- read.strand.second = events[i].strand;
- }
- // int len1 = 0;
- //int len2 = 0;
- svs.read_start = events[i - 1].read_pos_stop; // (short) events[i - 1].read_pos_start + (short) events[i - 1].length;
- svs.read_stop = events[i].read_pos_start;
- if (events[i - 1].strand) {
- svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
- svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
- } else {
- svs.start.min_pos = events[i].pos + events[i].length + get_ref_lengths(events[i].RefID, ref);
- svs.stop.max_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
- }
+ // if (events[i - 1].length > Parameter::Instance()->min_segment_size || events[i].length > Parameter::Instance()->min_segment_size) {
+ position_str svs;
+ //position_str stop;
+ read_str read;
+ read.sequence = "NA";
+ //read.name = tmp->getName();
+ read.type = type;
+ read.SV = 0;
+ read.read_strand.first = events[i - 1].strand;
+ read.read_strand.second = events[i].strand;
+
+ //stop.support.push_back(read);
+ if (events[i].RefID == events[i - 1].RefID) { //IF different chr -> tra
+ if (events[i - 1].strand == events[i].strand) { //IF same strand -> del/ins/dup
+ if (events[i - 1].strand) {
+ read.strand.first = events[i - 1].strand;
+ read.strand.second = !events[i].strand;
+ } else {
+ read.strand.first = !events[i - 1].strand;
+ read.strand.second = events[i].strand;
+ }
+ // int len1 = 0;
+ //int len2 = 0;
+ svs.read_start = events[i - 1].read_pos_stop; // (short) events[i - 1].read_pos_start + (short) events[i - 1].length;
+ svs.read_stop = events[i].read_pos_start;
+ if (events[i - 1].strand) {
+ svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
+ svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
+ } else {
+ svs.start.min_pos = events[i].pos + events[i].length + get_ref_lengths(events[i].RefID, ref);
+ svs.stop.max_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
+ }
- if (flag) {
- cout << "Debug: SV_Size: " << (svs.start.min_pos - svs.stop.max_pos) << " tmp: " << (svs.stop.max_pos - svs.start.min_pos) << " Ref_start: " << svs.start.min_pos - get_ref_lengths(events[i].RefID, ref) << " Ref_stop: " << svs.stop.max_pos - get_ref_lengths(events[i].RefID, ref) << " readstart: " << svs.read_start << " readstop: " << svs.read_stop << std::endl;
- }
+ if (flag) {
+ cout << "Debug: SV_Size: " << (svs.start.min_pos - svs.stop.max_pos) << " tmp: " << (svs.stop.max_pos - svs.start.min_pos) << " Ref_start: " << svs.start.min_pos - get_ref_lengths(events[i].RefID, ref) << " Ref_stop: " << svs.stop.max_pos - get_ref_lengths(events[i].RefID, ref) << " readstart: " << svs.read_start << " readstop: " << svs.read_stop << std::endl;
+ }
- if ((svs.stop.max_pos - svs.start.min_pos) > Parameter::Instance()->min_length * -1 && ((svs.stop.max_pos - svs.start.min_pos) + (Parameter::Instance()->min_length) < (svs.read_stop - svs.read_start) && (svs.read_stop - svs.read_start) > (Parameter::Instance()->min_length * 2))) {
- if (!events[i].cross_N || (double) ((svs.stop.max_pos - svs.start.min_pos) + Parameter::Instance()->min_length) < ((double) (svs.read_stop - svs.read_start) * Parameter::Instance()->avg_ins)) {
- svs.stop.max_pos += (svs.read_stop - svs.read_start); //TODO check!
- if (Parameter::Instance()->print_seq) {
- svs.read_stop = events[i].read_pos_start;
- svs.read_start = events[i - 1].read_pos_stop;
- if (svs.read_stop > tmp->getAlignment()->QueryBases.size()) {
- cerr << "BUG: split read ins! " << svs.read_stop << " " << tmp->getAlignment()->QueryBases.size() << " " << tmp->getName() << endl;
+ if ((svs.stop.max_pos - svs.start.min_pos) > Parameter::Instance()->min_length * -1 && ((svs.stop.max_pos - svs.start.min_pos) + (Parameter::Instance()->min_length) < (svs.read_stop - svs.read_start) && (svs.read_stop - svs.read_start) > (Parameter::Instance()->min_length * 2))) {
+ if (!events[i].cross_N || (double) ((svs.stop.max_pos - svs.start.min_pos) + Parameter::Instance()->min_length) < ((double) (svs.read_stop - svs.read_start) * Parameter::Instance()->avg_ins)) {
+ svs.stop.max_pos += (svs.read_stop - svs.read_start); //TODO check!
+ if (Parameter::Instance()->print_seq) {
+ svs.read_stop = events[i].read_pos_start;
+ svs.read_start = events[i - 1].read_pos_stop;
+ if (svs.read_stop > tmp->getAlignment()->QueryBases.size()) {
+ cerr << "BUG: split read ins! " << svs.read_stop << " " << tmp->getAlignment()->QueryBases.size() << " " << tmp->getName() << endl;
+ }
+ if (!events[i - 1].strand) {
+ std::string tmp_seq = reverse_complement(tmp->getAlignment()->QueryBases);
+
+ read.sequence = reverse_complement(tmp_seq.substr(svs.read_start, svs.read_stop - svs.read_start));
+ } else {
+ read.sequence = tmp->getAlignment()->QueryBases.substr(svs.read_start, svs.read_stop - svs.read_start);
+ }
+ if (flag) {
+ cout << "INS: " << endl;
+ cout << "split read ins! " << events[i - 1].read_pos_stop << " " << events[i].read_pos_start << " " << " " << tmp->getAlignment()->QueryBases.size() << " " << tmp->getName() << endl;
+ cout << "Seq+:" << read.sequence << endl;
+ }
}
- if (!events[i - 1].strand) {
- std::string tmp_seq = reverse_complement(tmp->getAlignment()->QueryBases);
+ read.SV |= INS;
+ } else {
+ read.SV |= 'n';
+ }
- read.sequence = reverse_complement(tmp_seq.substr(svs.read_start, svs.read_stop - svs.read_start));
- } else {
- read.sequence = tmp->getAlignment()->QueryBases.substr(svs.read_start, svs.read_stop - svs.read_start);
- }
+ } else if ((svs.start.min_pos - svs.stop.max_pos) * -1 > (svs.read_stop - svs.read_start) + (Parameter::Instance()->min_length)) {
+ if (!events[i].cross_N || (double) (svs.start.min_pos - svs.stop.max_pos) * Parameter::Instance()->avg_del * -1.0 > (double) ((svs.read_stop - svs.read_start) + (Parameter::Instance()->min_length))) {
+ read.SV |= DEL;
if (flag) {
- cout << "INS: " << endl;
- cout << "split read ins! " << events[i - 1].read_pos_stop << " " << events[i].read_pos_start << " " << " " << tmp->getAlignment()->QueryBases.size() << " " << tmp->getName() << endl;
- cout << "Seq+:" << read.sequence << endl;
+ cout << "DEL2" << endl;
}
+ } else {
+ read.SV |= 'n';
}
- read.SV |= INS;
- } else {
- read.SV |= 'n';
- }
- } else if ((svs.start.min_pos - svs.stop.max_pos) * -1 > (svs.read_stop - svs.read_start) + (Parameter::Instance()->min_length)) {
- if (!events[i].cross_N || (double) (svs.start.min_pos - svs.stop.max_pos) * Parameter::Instance()->avg_del * -1.0 > (double) ((svs.read_stop - svs.read_start) + (Parameter::Instance()->min_length))) {
- read.SV |= DEL;
+ } else if ((svs.start.min_pos - svs.stop.max_pos) > Parameter::Instance()->min_length && (svs.read_start - svs.read_stop) < Parameter::Instance()->min_length) { //check with respect to the coords of reads!
if (flag) {
- cout << "DEL2" << endl;
+ cout << "DUP: " << endl;
}
+ read.SV |= DUP;
} else {
- read.SV |= 'n';
+ if (flag) {
+ cout << "N" << endl;
+ }
+ read.SV = 'n';
}
+ } else { // if first part of read is in a different direction as the second part-> INV
- } else if ((svs.start.min_pos - svs.stop.max_pos) > Parameter::Instance()->min_length && (svs.read_start - svs.read_stop) < Parameter::Instance()->min_length) { //check with respect to the coords of reads!
- if (flag) {
- cout << "DUP: " << endl;
- }
- read.SV |= DUP;
- } else {
- if (flag) {
- cout << "N" << endl;
+ read.strand.first = events[i - 1].strand;
+ read.strand.second = !events[i].strand;
+
+ bool is_overlapping = overlaps(events[i - 1], events[i]);
+ if (is_overlapping && (events[i - 1].length > Parameter::Instance()->min_segment_size || events[i].length > Parameter::Instance()->min_segment_size)) {
+ if (flag) {
+ std::cout << "Overlap curr: " << events[i].pos << " " << events[i].pos + events[i].length << " prev: " << events[i - 1].pos << " " << events[i - 1].pos + events[i - 1].length << " " << tmp->getName() << std::endl;
+ }
+ read.SV |= NEST;
+
+ if (events[i - 1].strand) {
+ svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
+ svs.stop.max_pos = (events[i].pos + events[i].length) + get_ref_lengths(events[i].RefID, ref);
+ } else {
+ svs.start.min_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
+ svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
+ }
+
+ if (svs.start.min_pos > svs.stop.max_pos) {
+ long tmp = svs.start.min_pos;
+ svs.start.min_pos = svs.stop.max_pos;
+ svs.stop.max_pos = tmp;
+ }
+ } else if (!is_overlapping) {
+ read.SV |= INV;
+ if (events[i - 1].strand) {
+ svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
+ svs.stop.max_pos = (events[i].pos + events[i].length) + get_ref_lengths(events[i].RefID, ref);
+ } else {
+ svs.start.min_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
+ svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
+ }
}
- read.SV = 'n';
}
- } else { // if first part of read is in a different direction as the second part-> INV
+ } else { //if not on the same chr-> TRA
read.strand.first = events[i - 1].strand;
read.strand.second = !events[i].strand;
-
- bool is_overlapping = overlaps(events[i - 1], events[i]);
- if (is_overlapping && (events[i - 1].length > Parameter::Instance()->min_segment_size || events[i].length > Parameter::Instance()->min_segment_size)) {
- if (flag) {
- std::cout << "Overlap curr: " << events[i].pos << " " << events[i].pos + events[i].length << " prev: " << events[i - 1].pos << " " << events[i - 1].pos + events[i - 1].length << " " << tmp->getName() << std::endl;
- }
- read.SV |= NEST;
+ if (events[i - 1].strand == events[i].strand) {
+ //check this with + - strands!!
if (events[i - 1].strand) {
svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
- svs.stop.max_pos = (events[i].pos + events[i].length) + get_ref_lengths(events[i].RefID, ref);
+ svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
} else {
svs.start.min_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
- svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
- }
-
- if (svs.start.min_pos > svs.stop.max_pos) {
- long tmp = svs.start.min_pos;
- svs.start.min_pos = svs.stop.max_pos;
- svs.stop.max_pos = tmp;
+ svs.stop.max_pos = events[i].pos + events[i].length + get_ref_lengths(events[i].RefID, ref);
}
- } else if (!is_overlapping) {
- read.SV |= INV;
+ } else {
if (events[i - 1].strand) {
svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
- svs.stop.max_pos = (events[i].pos + events[i].length) + get_ref_lengths(events[i].RefID, ref);
+ svs.stop.max_pos = events[i].pos + events[i].length + get_ref_lengths(events[i].RefID, ref);
} else {
svs.start.min_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
}
}
+ read.SV |= TRA;
}
- } else { //if not on the same chr-> TRA
- read.strand.first = events[i - 1].strand;
- read.strand.second = !events[i].strand;
- if (events[i - 1].strand == events[i].strand) {
- //check this with + - strands!!
-
- if (events[i - 1].strand) {
- svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
- svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
- } else {
- svs.start.min_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
- svs.stop.max_pos = events[i].pos + events[i].length + get_ref_lengths(events[i].RefID, ref);
+ if (read.SV != 'n') {
+ if (flag) {
+ std::cout << "SPLIT: " << TRANS_type(read.SV) << " start: " << svs.start.min_pos - get_ref_lengths(events[i].RefID, ref) << " stop: " << svs.stop.max_pos - get_ref_lengths(events[i].RefID, ref);
+ if (events[i - 1].strand) {
+ std::cout << " +";
+ } else {
+ std::cout << " -";
+ }
+ if (events[i].strand) {
+ std::cout << " +";
+ } else {
+ std::cout << " -";
+ }
+ std::cout << " " << tmp->getName() << std::endl;
+ std::cout << "READ: " << svs.read_start << " " << svs.read_stop << " " << svs.read_start - svs.read_stop << std::endl;
}
- } else {
- if (events[i - 1].strand) {
- svs.start.min_pos = events[i - 1].pos + events[i - 1].length + get_ref_lengths(events[i - 1].RefID, ref);
- svs.stop.max_pos = events[i].pos + events[i].length + get_ref_lengths(events[i].RefID, ref);
- } else {
- svs.start.min_pos = events[i - 1].pos + get_ref_lengths(events[i - 1].RefID, ref);
- svs.stop.max_pos = events[i].pos + get_ref_lengths(events[i].RefID, ref);
+ //std::cout<<"split"<<std::endl;
+
+ svs.start.max_pos = svs.start.min_pos;
+ svs.stop.min_pos = svs.stop.max_pos;
+ if (svs.start.min_pos > svs.stop.max_pos) {
+ //maybe we have to invert the directions???
+ svs_breakpoint_str pos = svs.start;
+ svs.start = svs.stop;
+ svs.stop = pos;
+
+ pair<bool, bool> tmp = read.strand;
+
+ read.strand.first = tmp.second;
+ read.strand.second = tmp.first;
}
- }
- read.SV |= TRA;
- }
- if (read.SV != 'n') {
- if (flag) {
- std::cout << "SPLIT: " << TRANS_type(read.SV) << " start: " << svs.start.min_pos - get_ref_lengths(events[i].RefID, ref) << " stop: " << svs.stop.max_pos - get_ref_lengths(events[i].RefID, ref);
- if (events[i - 1].strand) {
- std::cout << " +";
+ //TODO: we might not need this:
+ if (svs.start.min_pos > svs.stop.max_pos) {
+ read.coordinates.first = svs.stop.max_pos;
+ read.coordinates.second = svs.start.min_pos;
} else {
- std::cout << " -";
+ read.coordinates.first = svs.start.min_pos;
+ read.coordinates.second = svs.stop.max_pos;
}
- if (events[i].strand) {
- std::cout << " +";
+
+ //pool out?
+ read.id = read_id;
+ svs.support[tmp->getName()] = read;
+ svs.support[tmp->getName()].length = abs(read.coordinates.second - read.coordinates.first);
+ Breakpoint * point = new Breakpoint(svs, abs(read.coordinates.second - read.coordinates.first));
+ //std::cout<<"split ADD: " << <<" Name: "<<tmp->getName()<<" "<< svs.start.min_pos- get_ref_lengths(events[i].RefID, ref)<<"-"<<svs.stop.max_pos- get_ref_lengths(events[i].RefID, ref)<<std::endl;
+ if (add) {
+ bst.insert_existant(point, root);
} else {
- std::cout << " -";
+ bst.insert(point, root);
}
- std::cout << " " << tmp->getName() << std::endl;
- std::cout << "READ: " << svs.read_start << " " << svs.read_stop << " " << svs.read_start - svs.read_stop << std::endl;
- }
- //std::cout<<"split"<<std::endl;
-
- svs.start.max_pos = svs.start.min_pos;
- svs.stop.min_pos = svs.stop.max_pos;
- if (svs.start.min_pos > svs.stop.max_pos) {
- //maybe we have to invert the directions???
- svs_breakpoint_str pos = svs.start;
- svs.start = svs.stop;
- svs.stop = pos;
-
- pair<bool, bool> tmp = read.strand;
-
- read.strand.first = tmp.second;
- read.strand.second = tmp.first;
- }
-
- //TODO: we might not need this:
- if (svs.start.min_pos > svs.stop.max_pos) {
- read.coordinates.first = svs.stop.max_pos;
- read.coordinates.second = svs.start.min_pos;
- } else {
- read.coordinates.first = svs.start.min_pos;
- read.coordinates.second = svs.stop.max_pos;
+ // std::cout<<"Print:"<<std::endl;
+ // bst.print(root);
}
-
- //pool out?
- read.id = read_id;
- svs.support[tmp->getName()] = read;
- svs.support[tmp->getName()].length = abs(read.coordinates.second - read.coordinates.first);
- Breakpoint * point = new Breakpoint(svs, abs(read.coordinates.second - read.coordinates.first));
- //std::cout<<"split ADD: " << <<" Name: "<<tmp->getName()<<" "<< svs.start.min_pos- get_ref_lengths(events[i].RefID, ref)<<"-"<<svs.stop.max_pos- get_ref_lengths(events[i].RefID, ref)<<std::endl;
- if (add) {
- bst.insert_existant(point, root);
- } else {
- bst.insert(point, root);
- }
- // std::cout<<"Print:"<<std::endl;
- // bst.print(root);
}
- }
+ //}
}
void estimate_parameters(std::string read_filename) {
- if(Parameter::Instance()->skip_parameter_estimation){
+ if (Parameter::Instance()->skip_parameter_estimation) {
return;
}
cout << "Estimating parameter..." << endl;
@@ -692,9 +701,10 @@ void estimate_parameters(std::string read_filename) {
double avg_del = 0;
double avg_ins = 0;
vector<int> tmp = tmp_aln->get_avg_diff(dist, avg_del, avg_ins);
+ // std::cout<<"Debug:\t"<<avg_del<<" "<<avg_ins<<endl;
tot_avg_ins += avg_ins;
tot_avg_del += avg_del;
- //std::cout<<"Debug:\t"<<dist<<"\t";
+ //
avg_dist += dist;
double avg_mis = 0;
for (size_t i = 0; i < tmp.size(); i++) {
@@ -749,6 +759,8 @@ void estimate_parameters(std::string read_filename) {
pos = nums.size() * 0.05; //the lowest 5% cuttoff
Parameter::Instance()->score_treshold = 2; //nums[pos]; //prev=2
+ //cout<<"test: "<<tot_avg_ins<<" "<<num<<endl;
+ //cout<<"test2: "<<tot_avg_del<<" "<<num<<endl;
Parameter::Instance()->avg_del = tot_avg_del / num;
Parameter::Instance()->avg_ins = tot_avg_ins / num;
=====================================
src/tree/Breakpoint_Tree.cpp
=====================================
@@ -25,28 +25,36 @@ void Breakpoint_Tree::find(int position, std::string chr, breakpoint_node *par,
}
}
-void Breakpoint_Tree::overalps(int start, int stop, std::string chr, breakpoint_node *par) {
+void Breakpoint_Tree::overalps(int start, int stop, std::string chr, breakpoint_node *par, bool ref_strand) {
//start + stop: read coordinates.
if (par == NULL) { //not found
return;
}
if (par->direction) { //start
- if ((par->position-100 > start && par->position+100 < stop) && strcmp(chr.c_str(), par->chr.c_str()) == 0) { //found
- par->ref_support++;
- // std::cout<<"start: "<<start<<" "<<stop<<std::endl;
+ if ((par->position - 100 > start && par->position + 100 < stop) && strcmp(chr.c_str(), par->chr.c_str()) == 0) { //found
+ if (ref_strand) {
+ par->ref_support.first++;
+ } else {
+ par->ref_support.second++;
+ }
+ // std::cout<<"start: "<<start<<" "<<stop<<std::endl;
}
} else { //stop coordinate
- if ((par->position > start+100 && par->position < stop-100) && strcmp(chr.c_str(), par->chr.c_str()) == 0) { //found
- par->ref_support++;
- // std::cout<<"stop: "<< start<<" "<<stop<<std::endl;
+ if ((par->position > start + 100 && par->position < stop - 100) && strcmp(chr.c_str(), par->chr.c_str()) == 0) { //found
+ if (ref_strand) {
+ par->ref_support.first++;
+ } else {
+ par->ref_support.second++;
+ }
+ // std::cout<<"stop: "<< start<<" "<<stop<<std::endl;
}
}
//search goes on:
if (start < par->position) {
- overalps(start, stop, chr, par->left);
+ overalps(start, stop, chr, par->left, ref_strand);
} else {
- overalps(start, stop, chr, par->right);
+ overalps(start, stop, chr, par->right, ref_strand);
}
}
@@ -57,7 +65,8 @@ void Breakpoint_Tree::insert(breakpoint_node *&tree, std::string chr, int positi
if (tree == NULL) {
tree = new breakpoint_node;
tree->position = position;
- tree->ref_support = 0;
+ tree->ref_support.first = 0;
+ tree->ref_support.second = 0;
tree->chr = chr;
tree->direction = direction;
tree->left = NULL;
@@ -70,19 +79,23 @@ void Breakpoint_Tree::insert(breakpoint_node *&tree, std::string chr, int positi
//std::cerr << "Element exists!" << std::endl;
//TODO we should use this information to assess the reliability of this call!
} else {
- insert(tree->left, chr, position,position); //think about that!
+ insert(tree->left, chr, position, position); //think about that!
}
}
-int Breakpoint_Tree::get_ref(breakpoint_node *&tree, std::string chr, int position) {
+std::pair<int,int> Breakpoint_Tree::get_ref(breakpoint_node *&tree, std::string chr, int position ) {
if (tree == NULL) {
- return -1;
+ std::pair<int,int> tmp;
+ tmp.first=-1;
+ tmp.second=-1;
+ return tmp;
}
if (tree->position > position) {
return get_ref(tree->left, chr, position);
} else if (tree->position < position) {
return get_ref(tree->right, chr, position);
} else if (strcmp(chr.c_str(), tree->chr.c_str()) == 0) { // found element
+
return tree->ref_support;
} else {
return get_ref(tree->left, chr, position); //just in case.
@@ -231,7 +244,7 @@ void Breakpoint_Tree::inorder(breakpoint_node *ptr) {
}
if (ptr != NULL) {
inorder(ptr->left);
- std::cout << ptr->chr << " " << ptr->position << " " << ptr->ref_support << std::endl;
+ std::cout << ptr->chr << " " << ptr->position << " " << ptr->ref_support.first <<" "<< ptr->ref_support.second<< std::endl;
inorder(ptr->right);
}
}
=====================================
src/tree/Breakpoint_Tree.h
=====================================
@@ -16,7 +16,7 @@ struct breakpoint_node {
std::string chr;
int position; // value to store!
bool direction;
- int ref_support;
+ std::pair<int,int> ref_support;
breakpoint_node *left;
breakpoint_node *right;
};
@@ -40,8 +40,8 @@ public:
void postorder(breakpoint_node *ptr);
void display(breakpoint_node *ptr, int);
void get_nodes(breakpoint_node *ptr, std::vector<int> & nodes);
- void overalps(int start,int stop,std::string chr, breakpoint_node *par);
- int get_ref(breakpoint_node *&tree, std::string chr, int position);
+ void overalps(int start,int stop,std::string chr, breakpoint_node *par,bool ref_strand);
+ std::pair<int,int> get_ref(breakpoint_node *&tree, std::string chr, int position);
};
=====================================
src/tree/IntervallTree.cpp
=====================================
@@ -12,6 +12,7 @@ void IntervallTree::careful_screening(Breakpoint *& new_break, TNode *p) { //may
careful_screening(new_break, p->left);
if (p->get_data()->overlap(new_break) == 0) { //SV type
p->get_data()->add_read(new_break);
+ //cout<<"Merged: "<<endl;
new_break->set_coordinates(-1, -1);
return;
}
@@ -32,6 +33,7 @@ void IntervallTree::insert(Breakpoint * new_break, TNode *&p) {
} else { // find on tree:
long score = p->get_data()->overlap(new_break); //comparison function
if (score == 0) { //add SV types?
+ //cout<<"Merged"<<endl;
p->get_data()->add_read(new_break);
new_break->set_coordinates(-1, -1);
//delete new_break;
=====================================
test_set/README
=====================================
@@ -1,10 +1,11 @@
This is a test set to confirm the correct installation of Sniffles.
Run Sniffles with default parameters:
./sniffles -m reads_region.bam -v test.vcf
+this should take no longer than 1 second.
You should expect to find one deletion at 21:21492142-21492648. You can also compare to the output provided in expected_output.vcf.
-If you have questions or concern please submit an issue on github:
+If you have questions or concern, please submit an issue on github:
https://github.com/fritzsedlazeck/Sniffles/issues
or drop me an email:
=====================================
test_set/test.vcf
=====================================
@@ -0,0 +1,118 @@
+##fileformat=VCFv4.2
+##source=Sniffles
+##fileDate=20180307
+##contig=<ID=1,length=249250621>
+##contig=<ID=2,length=243199373>
+##contig=<ID=3,length=198022430>
+##contig=<ID=4,length=191154276>
+##contig=<ID=5,length=180915260>
+##contig=<ID=6,length=171115067>
+##contig=<ID=7,length=159138663>
+##contig=<ID=8,length=146364022>
+##contig=<ID=9,length=141213431>
+##contig=<ID=10,length=135534747>
+##contig=<ID=11,length=135006516>
+##contig=<ID=12,length=133851895>
+##contig=<ID=13,length=115169878>
+##contig=<ID=14,length=107349540>
+##contig=<ID=15,length=102531392>
+##contig=<ID=16,length=90354753>
+##contig=<ID=17,length=81195210>
+##contig=<ID=18,length=78077248>
+##contig=<ID=19,length=59128983>
+##contig=<ID=20,length=63025520>
+##contig=<ID=21,length=48129895>
+##contig=<ID=22,length=51304566>
+##contig=<ID=X,length=155270560>
+##contig=<ID=Y,length=59373566>
+##contig=<ID=MT,length=16569>
+##contig=<ID=GL000207.1,length=4262>
+##contig=<ID=GL000226.1,length=15008>
+##contig=<ID=GL000229.1,length=19913>
+##contig=<ID=GL000231.1,length=27386>
+##contig=<ID=GL000210.1,length=27682>
+##contig=<ID=GL000239.1,length=33824>
+##contig=<ID=GL000235.1,length=34474>
+##contig=<ID=GL000201.1,length=36148>
+##contig=<ID=GL000247.1,length=36422>
+##contig=<ID=GL000245.1,length=36651>
+##contig=<ID=GL000197.1,length=37175>
+##contig=<ID=GL000203.1,length=37498>
+##contig=<ID=GL000246.1,length=38154>
+##contig=<ID=GL000249.1,length=38502>
+##contig=<ID=GL000196.1,length=38914>
+##contig=<ID=GL000248.1,length=39786>
+##contig=<ID=GL000244.1,length=39929>
+##contig=<ID=GL000238.1,length=39939>
+##contig=<ID=GL000202.1,length=40103>
+##contig=<ID=GL000234.1,length=40531>
+##contig=<ID=GL000232.1,length=40652>
+##contig=<ID=GL000206.1,length=41001>
+##contig=<ID=GL000240.1,length=41933>
+##contig=<ID=GL000236.1,length=41934>
+##contig=<ID=GL000241.1,length=42152>
+##contig=<ID=GL000243.1,length=43341>
+##contig=<ID=GL000242.1,length=43523>
+##contig=<ID=GL000230.1,length=43691>
+##contig=<ID=GL000237.1,length=45867>
+##contig=<ID=GL000233.1,length=45941>
+##contig=<ID=GL000204.1,length=81310>
+##contig=<ID=GL000198.1,length=90085>
+##contig=<ID=GL000208.1,length=92689>
+##contig=<ID=GL000191.1,length=106433>
+##contig=<ID=GL000227.1,length=128374>
+##contig=<ID=GL000228.1,length=129120>
+##contig=<ID=GL000214.1,length=137718>
+##contig=<ID=GL000221.1,length=155397>
+##contig=<ID=GL000209.1,length=159169>
+##contig=<ID=GL000218.1,length=161147>
+##contig=<ID=GL000220.1,length=161802>
+##contig=<ID=GL000213.1,length=164239>
+##contig=<ID=GL000211.1,length=166566>
+##contig=<ID=GL000199.1,length=169874>
+##contig=<ID=GL000217.1,length=172149>
+##contig=<ID=GL000216.1,length=172294>
+##contig=<ID=GL000215.1,length=172545>
+##contig=<ID=GL000205.1,length=174588>
+##contig=<ID=GL000219.1,length=179198>
+##contig=<ID=GL000224.1,length=179693>
+##contig=<ID=GL000223.1,length=180455>
+##contig=<ID=GL000195.1,length=182896>
+##contig=<ID=GL000212.1,length=186858>
+##contig=<ID=GL000222.1,length=186861>
+##contig=<ID=GL000200.1,length=187035>
+##contig=<ID=GL000193.1,length=189789>
+##contig=<ID=GL000194.1,length=191469>
+##contig=<ID=GL000225.1,length=211173>
+##contig=<ID=GL000192.1,length=547496>
+##contig=<ID=NC_007605,length=171823>
+##contig=<ID=hs37d5,length=35477943>
+##ALT=<ID=DEL,Description="Deletion">
+##ALT=<ID=DUP,Description="Duplication">
+##ALT=<ID=INV,Description="Inversion">
+##ALT=<ID=INVDUP,Description="InvertedDUP with unknown boundaries">
+##ALT=<ID=TRA,Description="Translocation">
+##ALT=<ID=INS,Description="Insertion">
+##INFO=<ID=CHR2,Number=1,Type=String,Description="Chromosome for END coordinate in case of a translocation">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the structural variant">
+##INFO=<ID=MAPQ,Number=1,Type=Integer,Description="Median mapping quality of paired-ends">
+##INFO=<ID=RE,Number=1,Type=Integer,Description="read support">
+##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">
+##INFO=<ID=PRECISE,Number=0,Type=Flag,Description="Precise structural variation">
+##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Length of the SV">
+##INFO=<ID=SVMETHOD,Number=1,Type=String,Description="Type of approach used to detect SV">
+##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
+##INFO=<ID=STD_quant_start,Number=A,Type=Integer,Description="STD of the start breakpoints across the reads.">
+##INFO=<ID=STD_quant_stop,Number=A,Type=Integer,Description="STD of the stop breakpoints across the reads.">
+##INFO=<ID=Kurtosis_quant_start,Number=A,Type=Integer,Description="Kurtosis value of the start breakpoints accross the reads.">
+##INFO=<ID=Kurtosis_quant_stop,Number=A,Type=Integer,Description="Kurtosis value of the stop breakpoints accross the reads.">
+##INFO=<ID=SUPTYPE,Number=1,Type=String,Description="Type by which the variant is supported.(SR,ALN)">
+##INFO=<ID=SUPTYPE,Number=1,Type=String,Description="Type by which the variant is supported.(SR,ALN)">
+##INFO=<ID=STRANDS,Number=A,Type=String,Description="Strand orientation of the adjacency in BEDPE format (DEL:+-, DUP:-+, INV:++/--)">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency.">
+##INFO=<ID=ZMW,Number=A,Type=Integer,Description="Number of ZMWs (Pacbio) supporting SV.">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=DR,Number=1,Type=Integer,Description="# high-quality reference reads">
+##FORMAT=<ID=DV,Number=1,Type=Integer,Description="# high-quality variant reads">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT reads_region.bam
+21 21492142 0 N <DEL> . PASS PRECISE;SVMETHOD=Snifflesv1.0.8;CHR2=21;END=21492648;STD_quant_start=0.000000;STD_quant_stop=0.000000;Kurtosis_quant_start=0.572582;Kurtosis_quant_stop=1.417662;SVTYPE=DEL;SUPTYPE=AL,SR;SVLEN=506;STRANDS=+-;RE=48 GT:DR:DV ./.:.:48
View it on GitLab: https://salsa.debian.org/med-team/sniffles/commit/7418012d740c7dc303df9ab996302bc365220467
--
View it on GitLab: https://salsa.debian.org/med-team/sniffles/commit/7418012d740c7dc303df9ab996302bc365220467
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20181015/999cbe25/attachment-0001.html>
More information about the debian-med-commit
mailing list