[med-svn] [Git][med-team/delly][upstream] New upstream version 0.8.5

Andreas Tille gitlab at salsa.debian.org
Fri Oct 2 13:35:06 BST 2020



Andreas Tille pushed to branch upstream at Debian Med / delly


Commits:
77fba0a1 by Andreas Tille at 2020-10-02T14:30:54+02:00
New upstream version 0.8.5
- - - - -


26 changed files:

- + .github/workflows/c-cpp.yml
- + .github/workflows/docker.yml
- .travis.yml
- docker/Dockerfile → Dockerfile
- + R/gcbias.R
- + R/rd.R
- README.md
- − docker/README.md
- singularity/delly.def
- + src/bed.h
- src/cluster.h
- + src/cnv.h
- + src/coral.h
- src/coverage.h
- src/delly.cpp
- src/delly.h
- + src/gcbias.h
- src/genotype.h
- src/merge.h
- src/modvcf.h
- src/needle.h
- + src/scan.h
- src/tags.h
- src/tegua.h
- src/util.h
- src/version.h


Changes:

=====================================
.github/workflows/c-cpp.yml
=====================================
@@ -0,0 +1,19 @@
+name: C/C++ CI
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout at v2
+    - name: make
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y libcurl4-gnutls-dev libhts-dev libhts2 libboost-date-time-dev libboost-program-options-dev libboost-system-dev libboost-filesystem-dev libboost-iostreams-dev
+        make


=====================================
.github/workflows/docker.yml
=====================================
@@ -0,0 +1,20 @@
+name: Docker CI
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout at v2
+    - uses: docker/build-push-action at v1
+      with:
+        username: ${{ secrets.DOCKER_USERNAME }}
+        password: ${{ secrets.DOCKER_PASSWORD }}
+        repository: dellytools/delly
+        tags: latest


=====================================
.travis.yml
=====================================
@@ -54,9 +54,6 @@ matrix:
             - ubuntu-toolchain-r-test
           packages:
             - g++-9
-    - env: MYCC="gcc" MYCXX="g++"
-      os: osx
-      osx_image: xcode8.3
     - env: MYCC="gcc" MYCXX="g++"
       os: osx
       osx_image: xcode11.3


=====================================
docker/Dockerfile → Dockerfile
=====================================


=====================================
R/gcbias.R
=====================================
@@ -0,0 +1,20 @@
+library(ggplot2)
+library(reshape2)
+
+args = commandArgs(trailingOnly=TRUE)
+x = read.table(args[1], header=T)
+x = read.table("gc.table", header=T)
+x$gc = x$gcsum / (nrow(x)-1)
+x$fractionSample = x$fractionSample * 100
+x$fractionReference = x$fractionReference * 100
+df = melt(x[,c("gc","fractionSample","fractionReference")], id.vars=c("gc"))
+
+# Whole genome
+p = ggplot(data=df, aes(x=gc, y=value))
+p = p + geom_bar(aes(color=variable, fill=variable), stat="identity")
+p = p + xlab("GC content")
+p = p + ylab("Obs / Exp")
+p = p + ylim(0, max(max(x$obsexp), max(x$fractionSample + x$fractionReference)))
+p = p + geom_line(data=x, aes(x=gc, y=obsexp), color="black")
+ggsave(p, file="gcbias.png", width=12, height=6)
+print(warnings())


=====================================
R/rd.R
=====================================
@@ -0,0 +1,43 @@
+library(ggplot2)
+library(scales)
+library(gtable)
+library(grid)
+
+chrNamesLong = c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20", "chr21", "chr22", "chrX")
+chrNamesShort = c("1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18","19","20","21","22","X")
+
+args = commandArgs(trailingOnly=TRUE)
+x = read.table(args[1], header=T)
+maxCN = 8
+
+# Fix chromosome ordering
+if (sum(x$chr %in% chrNamesLong) > sum(x$chr %in% chrNamesShort)) { chrs = chrNamesLong; } else { chrs = chrNamesShort; }
+x = x[x$chr %in% chrs,]
+x$chr = factor(x$chr, levels=chrs)
+
+# Whole genome
+p = ggplot(data=x, aes(x=start, y=x[,6]))
+p = p + geom_point(pch=21, size=0.5)
+p = p + xlab("Chromosome")
+p = p + ylab("Copy-number")
+p = p + scale_x_continuous(labels=comma)
+p = p + facet_grid(. ~ chr, scales="free_x", space="free_x")
+p = p + ylim(0, maxCN)
+p = p + theme(axis.text.x = element_text(angle=45, hjust=1))
+ggsave(p, file="plot.wholegenome.png", width=24, height=6)
+print(warnings())
+
+# By chromosome
+for(chrname in unique(x$chr)) {
+ print(chrname)
+ sub = x[x$chr == chrname,]
+ p = ggplot(data=sub, aes(x=start, y=sub[,6]))
+ p = p + geom_point(pch=21, size=0.5)
+ p = p + ylab("Copy-number") + xlab(chrname)
+ p = p + scale_x_continuous(labels=comma, breaks = scales::pretty_breaks(n=20))
+ p = p + ylim(0, maxCN)
+ p = p + theme(axis.text.x = element_text(angle=45, hjust=1))
+ ggsave(p, file=paste0("plot.", chrname, ".png"), width=24, height=6)
+ print(warnings())
+}
+


=====================================
README.md
=====================================
@@ -7,11 +7,10 @@
 
 [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat-square)](http://bioconda.github.io/recipes/delly/README.html)
 [![Anaconda-Server Badge](https://anaconda.org/bioconda/delly/badges/downloads.svg)](https://anaconda.org/bioconda/delly)
-[![Build Status](https://travis-ci.org/dellytools/delly.svg?branch=master)](https://travis-ci.org/dellytools/delly)
-[![Docker Build](https://img.shields.io/docker/build/dellytools/delly.svg)](https://hub.docker.com/r/dellytools/delly/)
+[![C/C++ CI](https://github.com/dellytools/delly/workflows/C/C++%20CI/badge.svg)](https://github.com/dellytools/delly/actions)
+[![Docker CI](https://github.com/dellytools/delly/workflows/Docker%20CI/badge.svg)](https://hub.docker.com/r/dellytools/delly/)
 [![GitHub license](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://github.com/dellytools/delly/blob/master/LICENSE)
 [![GitHub Releases](https://img.shields.io/github/release/dellytools/delly.svg)](https://github.com/dellytools/delly/releases)
-[![GitHub Issues](https://img.shields.io/github/issues/dellytools/delly.svg)](https://github.com/dellytools/delly/issues)
 
 Delly is an integrated structural variant (SV) prediction method that can discover, genotype and visualize deletions, tandem duplications, inversions and translocations at single-nucleotide resolution in short-read massively parallel sequencing data. It uses paired-ends, split-reads and read-depth to sensitively and accurately delineate genomic rearrangements throughout the genome. Structural variants can be visualized using [Delly-maze](https://github.com/dellytools/maze) and [Delly-suave](https://github.com/dellytools/suave).
 
@@ -37,9 +36,6 @@ Delly supports parallel computing using the OpenMP API (www.openmp.org).
 
 `make PARALLEL=1 -B src/delly`
 
-There is also a statically linked, multi-threaded binary for Linux 64-bit available under [releases](https://github.com/dellytools/delly/releases/).
-
-
 You can set the number of threads using the environment variable OMP_NUM_THREADS.
 
 `export OMP_NUM_THREADS=2`
@@ -58,6 +54,16 @@ Delly needs a sorted, indexed and duplicate marked bam file for every input samp
 `bcftools view delly.bcf > delly.vcf`
 
 
+Delly for long reads from PacBio or ONT (experimental)
+------------------------------------------------------
+
+Delly also has a long-read SV discovery mode.
+
+`delly lr -y ont -g hg19.fa -x hg19.excl input.bam`
+
+`delly lr -y pb -g hg19.fa -x hg19.excl input.bam`
+
+
 Somatic SV calling
 ------------------
 
@@ -104,34 +110,53 @@ Germline SV calling
 
 `delly filter -f germline -o germline.bcf merged.bcf`
 
+
+Read-depth profiles
+-------------------
+
+You can generate read-depth profiles with delly. This requires a mappability map which can be downloaded here:
+
+[Mappability Maps](https://gear.embl.de/data/delly/)
+
+The command to count reads in 10kbp windows and normalize the coverage is:
+
+`delly rd -a -g hg19.fa -m hg19.map input.bam`
+
+The output file can be plotted using R to generate normalized copy-number profiles:
+
+`Rscript R/rd.R out.cov.gz`
+
+The GC bias can be visualized using the stats output.
+
+`delly rd -s stats.gz -g hg19.fa -m hg19.map input.bam`
+
+`zgrep "^GC" stats.gz  > gc.table`
+
+`Rscript R/gcbias.R gc.table`
+
+
 FAQ
 ---
 * What is the smallest SV size Delly can call?  
-This depends on the sharpness of the insert size distribution. For an insert size of 200-300bp with a 20-30bp standard deviation, Delly starts to call reliable SVs >=300bp. Delly also supports calling of small InDels using soft-clipped reads only (-i option). In this mode the smallest SV size called is 15bp.
-
-* Is Delly read-group aware?   
-Yes. If you want to estimate separate insert size distributions for every read-group then use the -e option.
+This depends on the sharpness of the insert size distribution. For an insert size of 200-300bp with a 20-30bp standard deviation, Delly starts to call reliable SVs >=300bp. Delly also supports calling of small InDels using soft-clipped reads only, the smallest SV size called is 15bp.
 
 * Can Delly be used on a non-diploid genome?  
 Yes and no. The SV site discovery works for any ploidy. However, Delly's genotyping model assumes diploidy (hom. reference, het. and hom. alternative).
 
-* How do I run Delly if I have multiple different libraries/bam files for a single sample?    
-Merge these BAMs using tools such as [Picard](http://broadinstitute.github.io/picard/) and tag each library with a unique ReadGroup. If you have a sample with multiple read-groups please run delly with the -e option.
-
 * Delly is running too slowly what can I do?    
-You should exclude telomere and centromere regions and also all unplaced contigs. Delly ships with such an exclude list for human and mouse samples. In addition, you can filter input reads more stringently using -q 20 and -s 15. Small InDel calling takes more time (-i option), leave it out if you are only interested in large SVs (>=300bp).
+You should exclude telomere and centromere regions and also all unplaced contigs. Delly ships with such an exclude list for human and mouse samples. In addition, you can filter input reads more stringently using -q 20 and -s 15.
 
 * Are non-unique alignments, multi-mappings and/or multiple split-read alignments allowed?  
-Delly expects two alignment records in the bam file for every paired-end, one for the first and one for the second read. Multiple split-read alignment records of a given read are allowed if and only if one of them (e.g. the longest split alignment) is a primary alignment whereas all others are marked as secondary or supplementary (flag 0x0100 or flag 0x0800). This is the default for bwa mem.
+Delly expects two alignment records in the bam file for every paired-end, one for the first and one for the second read. Multiple split-read alignment records of a given read are allowed if and only if one of them is a primary alignment whereas all others are marked as secondary or supplementary (flag 0x0100 or flag 0x0800). This is the default for bwa mem.
 
 * What pre-processing of bam files is required?    
-Bam files need to be sorted, indexed and ideally duplicate marked. If multiple libraries are present for a single sample these need to be merged in a single bam file with unique ReadGroup tags.
+Bam files need to be sorted, indexed and ideally duplicate marked.
 
 * Usage/discussion mailing list?         
 There is a delly discussion group [delly-users](http://groups.google.com/d/forum/delly-users).
 
-* Docker support?            
-There is a dockerized delly available [here](https://hub.docker.com/r/dellytools/delly/).
+* Docker/Singularity support?            
+There is a dockerized delly available [here](https://hub.docker.com/r/dellytools/delly/) and singularity containers (*.sif files) are part of the [delly release](https://github.com/dellytools/delly/releases).
 
 * Bioconda support?             
 Delly is available via [bioconda](http://bioconda.github.io/recipes/delly/README.html).


=====================================
docker/README.md deleted
=====================================
@@ -1,18 +0,0 @@
-Dockerized Delly
-================
-
-[Delly](https://github.com/dellytools/delly) is available as a [Docker image](https://hub.docker.com/r/dellytools/delly/). You can pull the Delly image using
-
-`docker pull dellytools/delly`
-
-You can then run Delly from that image. Below we assume your alignment files in BAM format are in /var/data which is mounted as /root in the docker container.
-
-`docker run -it -v /var/data/:/root dellytools/delly`
-
-The container by default starts a shell which you can then use to run Delly.
-
-`delly call -o /root/sv.bcf -g /root/ref.fa /root/control.bam /root/tumor.bam`
-
-Once Delly is finished you can just exit the Docker image.
-
-`exit`


=====================================
singularity/delly.def
=====================================
@@ -17,7 +17,7 @@ Stage: build
 
 # Final image
 BootStrap: library
-From: alpine:3.9
+From: alpine:latest
 Stage: final
 
 %files from build


=====================================
src/bed.h
=====================================
@@ -0,0 +1,119 @@
+#ifndef BED_H
+#define BED_H
+
+#include <boost/filesystem.hpp>
+#include <boost/multi_array.hpp>
+#include <boost/progress.hpp>
+#include <boost/date_time/posix_time/posix_time.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/math/distributions/chi_squared.hpp>
+#include <boost/math/distributions/hypergeometric.hpp>
+#include <boost/iostreams/filtering_streambuf.hpp>
+#include <boost/iostreams/filtering_stream.hpp>
+#include <boost/iostreams/copy.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
+#include <boost/iostreams/device/file.hpp>
+#include <boost/iostreams/filter/zlib.hpp>
+
+#include <htslib/faidx.h>
+#include <htslib/vcf.h>
+#include <htslib/sam.h>
+
+namespace torali
+{
+
+  // Flattens overlapping intervals
+  template<typename TRegionsGenome>
+  inline int32_t
+    _parseBedIntervals(std::string const& filename, bool const filePresent, bam_hdr_t* hdr, TRegionsGenome& bedRegions) {
+    typedef typename TRegionsGenome::value_type TChrIntervals;
+    typedef typename TChrIntervals::interval_type TIVal;
+
+    int32_t intervals = 0;
+    if (filePresent) {
+      bedRegions.resize(hdr->n_targets, TChrIntervals());
+      std::ifstream chrFile(filename.c_str(), std::ifstream::in);
+      if (chrFile.is_open()) {
+	while (chrFile.good()) {
+	  std::string chrFromFile;
+	  getline(chrFile, chrFromFile);
+	  typedef boost::tokenizer< boost::char_separator<char> > Tokenizer;
+	  boost::char_separator<char> sep(" \t,;");
+	  Tokenizer tokens(chrFromFile, sep);
+	  Tokenizer::iterator tokIter = tokens.begin();
+	  if (tokIter!=tokens.end()) {
+	    std::string chrName = *tokIter++;
+	    int32_t tid = bam_name2id(hdr, chrName.c_str());
+	    if (tid >= 0) {
+	      if (tokIter!=tokens.end()) {
+		int32_t start = boost::lexical_cast<int32_t>(*tokIter++);
+		int32_t end = boost::lexical_cast<int32_t>(*tokIter++);
+		bedRegions[tid].insert(TIVal::right_open(start, end));
+		++intervals;
+	      }
+	    }
+	  }
+	}
+      }
+      chrFile.close();
+    }
+    return intervals;
+  }
+
+
+  // Keeps overlapping intervals
+  template<typename TRegionsGenome>
+  inline int32_t
+  _parsePotOverlappingIntervals(std::string const& filename, bool const filePresent, bam_hdr_t* hdr, TRegionsGenome& bedRegions) {
+    typedef typename TRegionsGenome::value_type TChrIntervals;
+	
+    int32_t intervals = 0;
+    if (filePresent) {
+      bedRegions.resize(hdr->n_targets, TChrIntervals());
+      std::ifstream chrFile(filename.c_str(), std::ifstream::in);
+      if (chrFile.is_open()) {
+	while (chrFile.good()) {
+	  std::string chrFromFile;
+	  getline(chrFile, chrFromFile);
+	  typedef boost::tokenizer< boost::char_separator<char> > Tokenizer;
+	  boost::char_separator<char> sep(" \t,;");
+	  Tokenizer tokens(chrFromFile, sep);
+	  Tokenizer::iterator tokIter = tokens.begin();
+	  if (tokIter!=tokens.end()) {
+	    std::string chrName = *tokIter++;
+	    int32_t tid = bam_name2id(hdr, chrName.c_str());
+	    if (tid >= 0) {
+	      if (tokIter!=tokens.end()) {
+		int32_t start = boost::lexical_cast<int32_t>(*tokIter++);
+		int32_t end = boost::lexical_cast<int32_t>(*tokIter++);
+		bedRegions[tid].insert(std::make_pair(start, end));
+		++intervals;
+	      }
+	    }
+	  }
+	}
+      }
+      chrFile.close();
+    }
+    return intervals;
+  }
+
+
+  template<typename TChrIntervals>
+  inline void
+  _mergeOverlappingBedEntries(TChrIntervals const& bedRegions, TChrIntervals& citv) {
+    typedef boost::icl::interval_set<uint32_t> TUniqueIntervals;
+    typedef typename TUniqueIntervals::interval_type TIVal;
+    TUniqueIntervals uitv;
+
+    // Insert intervals
+    for(typename TChrIntervals::const_iterator it = bedRegions.begin(); it != bedRegions.end(); ++it) uitv.insert(TIVal::right_open(it->first, it->second));
+    
+    // Fetch unique intervals
+    for(typename TUniqueIntervals::iterator it = uitv.begin(); it != uitv.end(); ++it) citv.insert(std::make_pair(it->lower(), it->upper()));
+  }
+}
+
+#endif


=====================================
src/cluster.h
=====================================
@@ -215,9 +215,9 @@ namespace torali
   }
 
 
-  template<typename TCompEdgeList>
+  template<typename TConfig, typename TCompEdgeList>
   inline void
-  _searchCliques(TCompEdgeList& compEdge, std::vector<SRBamRecord>& br, std::vector<StructuralVariantRecord>& sv, uint32_t const wiggle, int32_t const svt) {
+  _searchCliques(TConfig const& c, TCompEdgeList& compEdge, std::vector<SRBamRecord>& br, std::vector<StructuralVariantRecord>& sv, uint32_t const wiggle, int32_t const svt) {
     typedef typename TCompEdgeList::mapped_type TEdgeList;
     typedef typename TEdgeList::value_type TEdgeRecord;
     typedef typename TEdgeRecord::TVertexType TVertex;
@@ -285,8 +285,8 @@ namespace torali
 	}
       }
 
-      // At least 2 split reads?
-      if (clique.size()>1) {
+      // Enough split reads?
+      if (clique.size() >= c.minCliqueSize) {
 	int32_t svStart = (int32_t) (pos / (uint64_t) clique.size());
 	int32_t svEnd = (int32_t) (pos2 / (uint64_t) clique.size());
 	int32_t svInsLen = (int32_t) (inslen / (int32_t) clique.size());
@@ -338,7 +338,7 @@ namespace torali
 	    // Clean edge lists
 	    if (!compEdge.empty()) {
 	      // Search cliques
-	      _searchCliques(compEdge, br, sv, varisize, svt);
+	      _searchCliques(c, compEdge, br, sv, varisize, svt);
 	      lastConnectedNodeStart = lastConnectedNode;
 	      compEdge.clear();
 	    }
@@ -409,16 +409,16 @@ namespace torali
       }
       // Search cliques
       if (!compEdge.empty()) {
-	_searchCliques(compEdge, br, sv, varisize, svt);
+	_searchCliques(c, compEdge, br, sv, varisize, svt);
 	compEdge.clear();
       }
     }
   }
 
 
-  template<typename TCompEdgeList, typename TBamRecord, typename TSVs>
+  template<typename TConfig, typename TCompEdgeList, typename TBamRecord, typename TSVs>
   inline void
-  _searchCliques(TCompEdgeList& compEdge, TBamRecord const& bamRecord, TSVs& svs, int32_t const svt) {
+  _searchCliques(TConfig const& c, TCompEdgeList& compEdge, TBamRecord const& bamRecord, TSVs& svs, int32_t const svt) {
     typedef typename TCompEdgeList::mapped_type TEdgeList;
     typedef typename TEdgeList::value_type TEdgeRecord;
 
@@ -459,8 +459,9 @@ namespace torali
 	  else incompatible.insert(v);
 	}
       }
-      
-      if ((clique.size()>1) && (_svSizeCheck(svStart, svEnd, svt))) {
+
+      // Enough paired-ends
+      if ((clique.size() >= c.minCliqueSize) && (_svSizeCheck(svStart, svEnd, svt))) {
 	StructuralVariantRecord svRec;
 	svRec.chr = clusterRefID;
 	svRec.chr2 = clusterMateRefID;
@@ -520,7 +521,7 @@ namespace torali
       if (bamItIndex > lastConnectedNode) {
 	// Clean edge lists
 	if (!compEdge.empty()) {
-	  _searchCliques(compEdge, bamRecord, svs, svt);
+	  _searchCliques(c, compEdge, bamRecord, svs, svt);
 	  lastConnectedNodeStart = lastConnectedNode;
 	  compEdge.clear();
 	}
@@ -591,7 +592,7 @@ namespace torali
       }
     }
     if (!compEdge.empty()) {
-      _searchCliques(compEdge, bamRecord, svs, svt);
+      _searchCliques(c, compEdge, bamRecord, svs, svt);
       compEdge.clear();
     }
   }


=====================================
src/cnv.h
=====================================
@@ -0,0 +1,112 @@
+#ifndef CNV_H
+#define CNV_H
+
+#include <boost/filesystem.hpp>
+#include <boost/multi_array.hpp>
+#include <boost/progress.hpp>
+#include <boost/date_time/posix_time/posix_time.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/math/distributions/chi_squared.hpp>
+#include <boost/math/distributions/hypergeometric.hpp>
+#include <boost/iostreams/filtering_streambuf.hpp>
+#include <boost/iostreams/filtering_stream.hpp>
+#include <boost/iostreams/copy.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
+#include <boost/iostreams/device/file.hpp>
+#include <boost/iostreams/filter/zlib.hpp>
+
+#include <htslib/faidx.h>
+#include <htslib/vcf.h>
+#include <htslib/sam.h>
+
+
+namespace torali
+{
+
+  struct CNV {
+    int32_t chr;
+    int32_t start;
+    int32_t end;
+    int32_t ciposlow;
+    int32_t ciposhigh;
+    int32_t ciendlow;
+    int32_t ciendhigh;
+    uint32_t srleft;
+    uint32_t srright;
+    uint32_t nsnps;
+    float cn;
+    float rdsupport;
+    float penalty;
+    float mappable;
+    double maf;
+    
+  CNV(int32_t const c, int32_t const s, int32_t const e, int32_t const cil, int32_t const cih, int32_t const cel, int32_t ceh, float const estcn, float const sp, float const pty, float const mp) : chr(c), start(s), end(e), ciposlow(cil), ciposhigh(cih), ciendlow(cel), ciendhigh(ceh), srleft(0), srright(0), nsnps(0), cn(estcn), rdsupport(sp), penalty(pty), mappable(mp), maf(0) {}
+  };
+
+    template<typename TCNV>
+    struct SortCNVs : public std::binary_function<TCNV, TCNV, bool>
+      {
+	inline bool operator()(TCNV const& sv1, TCNV const& sv2) {
+	  return ((sv1.chr<sv2.chr) || ((sv1.chr==sv2.chr) && (sv1.start<sv2.start)) || ((sv1.chr==sv2.chr) && (sv1.start==sv2.start) && (sv1.end<sv2.end)) || ((sv1.chr==sv2.chr) && (sv1.start==sv2.start) && (sv1.end==sv2.end) && (sv1.penalty > sv2.penalty)));
+	}
+      };
+      
+  
+  template<typename TConfig, typename TGcBias, typename TCoverage>
+  inline void
+  callCNVs(TConfig const& c, std::pair<uint32_t, uint32_t> const& gcbound, std::vector<uint16_t> const& gcContent, std::vector<uint16_t> const& uniqContent, TGcBias const& gcbias, TCoverage const& cov, bam_hdr_t const* hdr, int32_t const refIndex) {
+    
+    // cnv tiling windows, window_offset smallest window, window_size largest window
+    uint32_t numTiling = 0;
+    {
+      uint32_t winbound = c.window_offset;
+      while (winbound < c.window_size) {
+	++numTiling;
+	winbound *= 2;
+      }
+    }
+
+    typedef std::vector<int16_t> TCNVec;
+    typedef std::vector<TCNVec> TTilingCN;
+    TTilingCN cnvec(numTiling, TCNVec());
+    for(uint32_t i = 0; i < numTiling; ++i) cnvec[i].resize(hdr->target_len[refIndex]/50, -1);
+    for(uint32_t start = 0; start < hdr->target_len[refIndex]; start = start + 50) {
+      if (start + c.window_offset < hdr->target_len[refIndex]) {
+	double covsum = 0;
+	double expcov = 0;
+	uint32_t winlen = 0;
+	uint32_t winbound = c.window_offset;
+	uint32_t tilingPos = 0;
+	for(uint32_t pos = start; ((pos < start + c.window_size) && (pos < hdr->target_len[refIndex])); ++pos) {
+	  if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) {
+	    covsum += cov[pos];
+	    expcov += gcbias[gcContent[pos]].coverage;
+	    ++winlen;
+	  }
+	  // Multiple of window size?
+	  if ((pos - start) == winbound) {
+	    if (winlen >= c.fracWindow * (pos - start)) {
+	      cnvec[tilingPos][start/hdr->target_len[refIndex]] = (int32_t) boost::math::round(c.ploidy * covsum / expcov * 100.0);
+	    }
+	    winbound *= 2;
+	    ++tilingPos;
+	  }
+	}
+      }
+    }
+
+    // Iterate CNVs
+    for(int32_t i = numTiling - 1; i>=0; --i) {
+      for(uint32_t k = 0; k < cnvec[i].size(); ++k) {
+	std::cout << cnvec[i][k] << ',';
+      }
+      std::cout << std::endl;
+    }
+  }
+  
+
+}
+
+#endif


=====================================
src/coral.h
=====================================
@@ -0,0 +1,722 @@
+#ifndef CORAL_H
+#define CORAL_H
+
+#include <limits>
+
+#include <boost/icl/split_interval_map.hpp>
+#include <boost/dynamic_bitset.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/date_time/posix_time/posix_time.hpp>
+#include <boost/date_time/gregorian/gregorian.hpp>
+#include <boost/math/special_functions/round.hpp>
+#include <boost/progress.hpp>
+
+#include <htslib/sam.h>
+#include <htslib/faidx.h>
+
+#include "bed.h"
+#include "scan.h"
+#include "gcbias.h"
+#include "cnv.h"
+#include "version.h"
+
+namespace torali
+{
+
+  struct CountDNAConfig {
+    bool adaptive;
+    bool hasPanelFile;
+    bool hasStatsFile;
+    bool hasBedFile;
+    bool hasScanFile;
+    bool noScanWindowSelection;
+    uint32_t nchr;
+    uint32_t meanisize;
+    uint32_t window_size;
+    uint32_t window_offset;
+    uint32_t scanWindow;
+    uint32_t minChrLen;
+    uint16_t minQual;
+    uint16_t mad;
+    uint16_t ploidy;
+    float exclgc;
+    float uniqueToTotalCovRatio;
+    float fracWindow;
+    float fragmentUnique;
+    float controlMaf;
+    std::string sampleName;
+    boost::filesystem::path outfile;
+    boost::filesystem::path panelfile;
+    boost::filesystem::path genome;
+    boost::filesystem::path statsFile;
+    boost::filesystem::path mapFile;
+    boost::filesystem::path bamFile;
+    boost::filesystem::path bedFile;
+    boost::filesystem::path scanFile;
+  };
+
+  struct CountDNAConfigLib {
+    uint16_t madCutoff;
+    uint16_t madNormalCutoff;
+    boost::filesystem::path genome;
+    std::vector<boost::filesystem::path> files;
+  };
+  
+  template<typename TConfig>
+  inline int32_t
+  bamCount(TConfig const& c, LibraryInfo const& li, std::vector<GcBias> const& gcbias, std::pair<uint32_t, uint32_t> const& gcbound) {
+    // Load bam file
+    samFile* samfile = sam_open(c.bamFile.string().c_str(), "r");
+    hts_set_fai_filename(samfile, c.genome.string().c_str());
+    hts_idx_t* idx = sam_index_load(samfile, c.bamFile.string().c_str());
+    bam_hdr_t* hdr = sam_hdr_read(samfile);
+
+    // BED regions
+    typedef std::set<std::pair<uint32_t, uint32_t> > TChrIntervals;
+    typedef std::vector<TChrIntervals> TRegionsGenome;
+    TRegionsGenome bedRegions;
+    if (c.hasBedFile) {
+      if (!_parsePotOverlappingIntervals(c.bedFile.string(), c.hasBedFile, hdr, bedRegions)) {
+	std::cerr << "Couldn't parse BED intervals. Do the chromosome names match?" << std::endl;
+	return 1;
+      }
+    }
+
+    // Parse BAM file
+    boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
+    std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Count fragments" << std::endl;
+    boost::progress_display show_progress( hdr->n_targets );
+
+    // Open output files
+    boost::iostreams::filtering_ostream dataOut;
+    dataOut.push(boost::iostreams::gzip_compressor());
+    dataOut.push(boost::iostreams::file_sink(c.outfile.c_str(), std::ios_base::out | std::ios_base::binary));
+    dataOut << "chr\tstart\tend\t" << c.sampleName << "_mappable\t" << c.sampleName << "_counts\t" << c.sampleName << "_CN" << std::endl;
+
+    boost::iostreams::filtering_ostream panelOut;
+    if (c.hasPanelFile) {
+      panelOut.push(boost::iostreams::gzip_compressor());
+      panelOut.push(boost::iostreams::file_sink(c.panelfile.c_str(), std::ios_base::out | std::ios_base::binary));
+    }
+    
+    // Iterate chromosomes
+    faidx_t* faiMap = fai_load(c.mapFile.string().c_str());
+    faidx_t* faiRef = fai_load(c.genome.string().c_str());
+    for(int32_t refIndex=0; refIndex < (int32_t) hdr->n_targets; ++refIndex) {
+      ++show_progress;
+      if (chrNoData(c, refIndex, idx)) continue;
+
+      // Check presence in mappability map
+      std::string tname(hdr->target_name[refIndex]);
+      int32_t seqlen = faidx_seq_len(faiMap, tname.c_str());
+      if (seqlen == - 1) continue;
+      else seqlen = -1;
+      char* seq = faidx_fetch_seq(faiMap, tname.c_str(), 0, faidx_seq_len(faiMap, tname.c_str()), &seqlen);
+
+      // Check presence in reference
+      seqlen = faidx_seq_len(faiRef, tname.c_str());
+      if (seqlen == - 1) continue;
+      else seqlen = -1;
+      char* ref = faidx_fetch_seq(faiRef, tname.c_str(), 0, faidx_seq_len(faiRef, tname.c_str()), &seqlen);
+
+      // Get GC and Mappability
+      std::vector<uint16_t> uniqContent(hdr->target_len[refIndex], 0);
+      std::vector<uint16_t> gcContent(hdr->target_len[refIndex], 0);
+      {
+	// Mappability map
+	typedef boost::dynamic_bitset<> TBitSet;
+	TBitSet uniq(hdr->target_len[refIndex], false);
+	for(uint32_t i = 0; i < hdr->target_len[refIndex]; ++i) {
+	  if (seq[i] == 'C') uniq[i] = 1;
+	}
+
+	// GC map
+	typedef boost::dynamic_bitset<> TBitSet;
+	TBitSet gcref(hdr->target_len[refIndex], false);
+	for(uint32_t i = 0; i < hdr->target_len[refIndex]; ++i) {
+	  if ((ref[i] == 'c') || (ref[i] == 'C') || (ref[i] == 'g') || (ref[i] == 'G')) gcref[i] = 1;
+	}
+
+	// Sum across fragment
+	int32_t halfwin = (int32_t) (c.meanisize / 2);
+	int32_t usum = 0;
+	int32_t gcsum = 0;
+	for(int32_t pos = halfwin; pos < (int32_t) hdr->target_len[refIndex] - halfwin; ++pos) {
+	  if (pos == halfwin) {
+	    for(int32_t i = pos - halfwin; i<=pos+halfwin; ++i) {
+	      usum += uniq[i];
+	      gcsum += gcref[i];
+	    }
+	  } else {
+	    usum -= uniq[pos - halfwin - 1];
+	    gcsum -= gcref[pos - halfwin - 1];
+	    usum += uniq[pos + halfwin];
+	    gcsum += gcref[pos + halfwin];
+	  }
+	  gcContent[pos] = gcsum;
+	  uniqContent[pos] = usum;
+	}
+      }
+      
+      // Coverage track
+      typedef uint16_t TCount;
+      uint32_t maxCoverage = std::numeric_limits<TCount>::max();
+      typedef std::vector<TCount> TCoverage;
+      TCoverage cov(hdr->target_len[refIndex], 0);
+
+      {
+	// Mate map
+	typedef boost::unordered_map<std::size_t, bool> TMateMap;
+	TMateMap mateMap;
+	
+	// Count reads
+	hts_itr_t* iter = sam_itr_queryi(idx, refIndex, 0, hdr->target_len[refIndex]);
+	bam1_t* rec = bam_init1();
+	int32_t lastAlignedPos = 0;
+	std::set<std::size_t> lastAlignedPosReads;
+	while (sam_itr_next(samfile, iter, rec) >= 0) {
+	  if (rec->core.flag & (BAM_FQCFAIL | BAM_FDUP | BAM_FUNMAP | BAM_FSECONDARY | BAM_FSUPPLEMENTARY)) continue;
+	  if (rec->core.qual < c.minQual) continue;	  
+	  if ((rec->core.flag & BAM_FPAIRED) && ((rec->core.flag & BAM_FMUNMAP) || (rec->core.tid != rec->core.mtid))) continue;
+
+	  int32_t midPoint = rec->core.pos + halfAlignmentLength(rec);
+	  if (rec->core.flag & BAM_FPAIRED) {
+	    // Clean-up the read store for identical alignment positions
+	    if (rec->core.pos > lastAlignedPos) {
+	      lastAlignedPosReads.clear();
+	      lastAlignedPos = rec->core.pos;
+	    }
+	    
+	    if ((rec->core.pos < rec->core.mpos) || ((rec->core.pos == rec->core.mpos) && (lastAlignedPosReads.find(hash_string(bam_get_qname(rec))) == lastAlignedPosReads.end()))) {
+	      // First read
+	      lastAlignedPosReads.insert(hash_string(bam_get_qname(rec)));
+	      std::size_t hv = hash_pair(rec);
+	      mateMap[hv] = true;
+	      continue;
+	    } else {
+	      // Second read
+	      std::size_t hv = hash_pair_mate(rec);
+	      if ((mateMap.find(hv) == mateMap.end()) || (!mateMap[hv])) continue; // Mate discarded
+	      mateMap[hv] = false;
+	    }
+	    
+	    // update midpoint
+	    int32_t isize = (rec->core.pos + alignmentLength(rec)) - rec->core.mpos;
+	    if ((li.minNormalISize < isize) && (isize < li.maxNormalISize)) midPoint = rec->core.mpos + (int32_t) (isize/2);
+	  }
+	  
+	  // Count fragment
+	  if ((midPoint >= 0) && (midPoint < (int32_t) hdr->target_len[refIndex]) && (cov[midPoint] < maxCoverage - 1)) ++cov[midPoint];
+	}
+	// Clean-up
+	if (seq != NULL) free(seq);
+	if (ref != NULL) free(ref);
+	bam_destroy1(rec);
+	hts_itr_destroy(iter);
+      }
+
+
+      // Call CNVs
+      //callCNVs(c, gcbound, gcContent, uniqContent, gcbias, cov, hdr, refIndex);
+
+
+      // BED File (target intervals)
+      if (c.hasBedFile) {
+	if (c.adaptive) {
+	  // Merge overlapping BED entries
+	  TChrIntervals citv;
+	  _mergeOverlappingBedEntries(bedRegions[refIndex], citv);
+
+	  // Tile merged intervals
+	  double covsum = 0;
+	  double expcov = 0;
+	  double obsexp = 0;
+	  uint32_t winlen = 0;
+	  uint32_t start = 0;
+	  bool endOfWindow = true;
+	  typename TChrIntervals::iterator it = citv.begin();
+	  if (it != citv.end()) start = it->first;
+	  while(endOfWindow) {
+	    endOfWindow = false;
+	    for(it = citv.begin(); ((it != citv.end()) && (!endOfWindow)); ++it) {
+	      if ((it->first < it->second) && (it->second <= hdr->target_len[refIndex])) {
+		if (start >= it->second) {
+		  if (start == it->second) {
+		    // Special case
+		    typename TChrIntervals::iterator itNext = it;
+		    ++itNext;
+		    if (itNext != citv.end()) start = itNext->first;
+		  }
+		  continue;
+		}
+		for(uint32_t pos = it->first; ((pos < it->second) && (!endOfWindow)); ++pos) {
+		  if (pos < start) continue;
+		  if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) {
+		    covsum += cov[pos];
+		    obsexp += gcbias[gcContent[pos]].obsexp;
+		    expcov += gcbias[gcContent[pos]].coverage;
+		    ++winlen;
+		    if (winlen == c.window_size) {
+		      obsexp /= (double) winlen;
+		      double count = ((double) covsum / obsexp ) * (double) c.window_size / (double) winlen;
+		      double cn = c.ploidy * covsum / expcov;
+		      dataOut << std::string(hdr->target_name[refIndex]) << "\t" << start << "\t" << (pos + 1) << "\t" << winlen << "\t" << count << "\t" << cn << std::endl;
+		      // reset
+		      covsum = 0;
+		      expcov = 0;
+		      obsexp = 0;
+		      winlen = 0;
+		      if (c.window_offset == c.window_size) {
+			// Move on
+			start = pos + 1;
+			endOfWindow = true;
+		      } else {
+			// Rewind
+			for(typename TChrIntervals::iterator sit = citv.begin(); ((sit != citv.end()) && (!endOfWindow)); ++sit) {
+			  if ((sit->first < sit->second) && (sit->second <= hdr->target_len[refIndex])) {
+			    if (start >= sit->second) continue;
+			    for(uint32_t k = sit->first; ((k < sit->second) && (!endOfWindow)); ++k) {
+			      if (k < start) continue;
+			      if ((gcContent[k] > gcbound.first) && (gcContent[k] < gcbound.second) && (uniqContent[k] >= c.fragmentUnique * c.meanisize)) {
+				++winlen;
+				if (winlen == c.window_offset) {
+				  start = k + 1;
+				  winlen = 0;
+				  endOfWindow = true;
+				}
+			      }
+			    }
+			  }
+			}
+		      }
+		    }
+		  }
+		}
+	      }
+	    }
+	  }
+	} else {
+	  // Fixed Window Length
+	  for(typename TChrIntervals::iterator it = bedRegions[refIndex].begin(); it != bedRegions[refIndex].end(); ++it) {
+	    if ((it->first < it->second) && (it->second <= hdr->target_len[refIndex])) {
+	      double covsum = 0;
+	      double expcov = 0;
+	      double obsexp = 0;
+	      uint32_t winlen = 0;
+	      for(uint32_t pos = it->first; pos < it->second; ++pos) {
+		if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) {
+		  covsum += cov[pos];
+		  obsexp += gcbias[gcContent[pos]].obsexp;
+		  expcov += gcbias[gcContent[pos]].coverage;
+		  ++winlen;
+		}
+	      }
+	      if (winlen >= c.fracWindow * (it->second - it->first)) {
+		obsexp /= (double) winlen;
+		double count = ((double) covsum / obsexp ) * (double) (it->second - it->first) / (double) winlen;
+		double cn = c.ploidy * covsum / expcov;
+		dataOut << std::string(hdr->target_name[refIndex]) << "\t" << it->first << "\t" << it->second << "\t" << winlen << "\t" << count << "\t" << cn << std::endl;
+	      } else {
+		dataOut << std::string(hdr->target_name[refIndex]) << "\t" << it->first << "\t" << it->second << "\tNA\tNA\tNA" << std::endl;
+	      }
+	    }
+	  }
+	}
+      } else {
+	// Genome-wide
+	if (c.adaptive) {
+	  double covsum = 0;
+	  double expcov = 0;
+	  double obsexp = 0;
+	  uint32_t winlen = 0;
+	  uint32_t start = 0;
+	  uint32_t pos = 0;
+	  while(pos < hdr->target_len[refIndex]) {
+	    if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) {
+	      covsum += cov[pos];
+	      obsexp += gcbias[gcContent[pos]].obsexp;
+	      expcov += gcbias[gcContent[pos]].coverage;
+	      ++winlen;
+	      if (winlen == c.window_size) {
+		obsexp /= (double) winlen;
+		double count = ((double) covsum / obsexp ) * (double) c.window_size / (double) winlen;
+		double cn = c.ploidy * covsum / expcov;
+		dataOut << std::string(hdr->target_name[refIndex]) << "\t" << start << "\t" << (pos + 1) << "\t" << winlen << "\t" << count << "\t" << cn << std::endl;
+		// reset
+		covsum = 0;
+		expcov = 0;
+		obsexp = 0;
+		winlen = 0;
+		if (c.window_offset == c.window_size) {
+		  // Move on
+		  start = pos + 1;
+		} else {
+		  // Rewind
+		  for(uint32_t k = start; k < hdr->target_len[refIndex]; ++k) {
+		    if ((gcContent[k] > gcbound.first) && (gcContent[k] < gcbound.second) && (uniqContent[k] >= c.fragmentUnique * c.meanisize)) {
+		      ++winlen;
+		      if (winlen == c.window_offset) {
+			start = k + 1;
+			pos = k;
+			winlen = 0;
+			break;
+		      }
+		    }
+		  }
+		}
+	      }
+	    }
+	    ++pos;
+	  }
+	} else {
+	  // Fixed windows (genomic tiling)
+	  for(uint32_t start = 0; start < hdr->target_len[refIndex]; start = start + c.window_offset) {
+	    if (start + c.window_size < hdr->target_len[refIndex]) {
+	      double covsum = 0;
+	      double expcov = 0;
+	      double obsexp = 0;
+	      uint32_t winlen = 0;
+	      for(uint32_t pos = start; pos < start + c.window_size; ++pos) {
+		if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) {
+		  covsum += cov[pos];
+		  obsexp += gcbias[gcContent[pos]].obsexp;
+		  expcov += gcbias[gcContent[pos]].coverage;
+		  ++winlen;
+		}
+	      }
+	      if (winlen >= c.fracWindow * c.window_size) {
+		obsexp /= (double) winlen;
+		double count = ((double) covsum / obsexp ) * (double) c.window_size / (double) winlen;
+		double cn = c.ploidy * covsum / expcov;
+		dataOut << std::string(hdr->target_name[refIndex]) << "\t" << start << "\t" << (start + c.window_size) << "\t" << winlen << "\t" << count << "\t" << cn << std::endl;
+	      }
+	    }
+	  }
+	}
+
+	// Panel output?
+	if (c.hasPanelFile) {
+	  // cnv tiling windows
+	  uint32_t smallestWin = 300;
+	  uint32_t largestWin = 40000;
+	  uint32_t numTiling = 0;
+	  {
+	    uint32_t winbound = smallestWin;
+	    while (winbound < largestWin) {
+	      ++numTiling;
+	      winbound *= 2;
+	    }
+	  }
+	  std::vector<int32_t> cnvec(numTiling, -1);
+	  panelOut << "#" << std::string(hdr->target_name[refIndex]) << "," << hdr->target_len[refIndex] << "," << smallestWin << "," << largestWin << "," << c.sampleName << std::endl;
+	  for(uint32_t start = 0; start < hdr->target_len[refIndex]; start = start + 50) {
+	    if (start + smallestWin < hdr->target_len[refIndex]) {
+	      double covsum = 0;
+	      double expcov = 0;
+	      uint32_t winlen = 0;
+	      uint32_t winbound = smallestWin;
+	      uint32_t tilingPos = 0;
+	      std::fill(cnvec.begin(), cnvec.end(), -1);
+	      bool validCN = false;
+	      for(uint32_t pos = start; ((pos < start + largestWin) && (pos < hdr->target_len[refIndex])); ++pos) {
+		if ((gcContent[pos] > gcbound.first) && (gcContent[pos] < gcbound.second) && (uniqContent[pos] >= c.fragmentUnique * c.meanisize)) {
+		  covsum += cov[pos];
+		  expcov += gcbias[gcContent[pos]].coverage;
+		  ++winlen;
+		}
+		// Multiple of window size?
+		if ((pos - start) == winbound) {
+		  if (winlen >= c.fracWindow * (pos - start)) {
+		    cnvec[tilingPos] = (int32_t) boost::math::round(c.ploidy * covsum / expcov * 100.0);
+		    validCN = true;
+		  }
+		  winbound *= 2;
+		  ++tilingPos;
+		}
+	      }
+	      if (validCN) {
+		for(uint32_t i = 0; i < numTiling; ++i) panelOut << cnvec[i] << ',';
+		panelOut << std::endl;
+	      } else panelOut << "-2," << std::endl;
+	    }
+	  }
+	}
+      }
+    }
+
+    // clean-up
+    fai_destroy(faiRef);
+    fai_destroy(faiMap);
+    bam_hdr_destroy(hdr);
+    hts_idx_destroy(idx);
+    sam_close(samfile);
+    dataOut.pop();
+    dataOut.pop();
+    if (c.hasPanelFile) {
+      panelOut.pop();
+      panelOut.pop();
+    }
+    
+    return 0;
+  }
+
+  
+  int coral(int argc, char **argv) {
+    CountDNAConfig c;
+
+    // Parameter
+    boost::program_options::options_description generic("Generic options");
+    generic.add_options()
+      ("help,?", "show help message")
+      ("genome,g", boost::program_options::value<boost::filesystem::path>(&c.genome), "genome file")
+      ("quality,q", boost::program_options::value<uint16_t>(&c.minQual)->default_value(10), "min. mapping quality")
+      ("mappability,m", boost::program_options::value<boost::filesystem::path>(&c.mapFile), "input mappability map")
+      ("ploidy,y", boost::program_options::value<uint16_t>(&c.ploidy)->default_value(2), "baseline ploidy")
+      ("fragment,e", boost::program_options::value<float>(&c.fragmentUnique)->default_value(0.97), "min. fragment uniqueness [0,1]")
+      ("statsfile,s", boost::program_options::value<boost::filesystem::path>(&c.statsFile), "gzipped stats output file (optional)")
+      ("outfile,o", boost::program_options::value<boost::filesystem::path>(&c.outfile)->default_value("out.cov.gz"), "output file")
+      ("adaptive-windowing,a", "use mappable bases for window size")
+      ;
+
+    boost::program_options::options_description window("Window options");
+    window.add_options()
+      ("window-size,i", boost::program_options::value<uint32_t>(&c.window_size)->default_value(10000), "window size")
+      ("window-offset,j", boost::program_options::value<uint32_t>(&c.window_offset)->default_value(10000), "window offset")
+      ("bed-intervals,b", boost::program_options::value<boost::filesystem::path>(&c.bedFile), "input BED file")
+      ("fraction-window,k", boost::program_options::value<float>(&c.fracWindow)->default_value(0.25), "min. callable window fraction [0,1]")
+      ;
+
+    boost::program_options::options_description gcopt("GC options");
+    gcopt.add_options()
+      ("scan-window,w", boost::program_options::value<uint32_t>(&c.scanWindow)->default_value(10000), "scanning window size")
+      ("fraction-unique,f", boost::program_options::value<float>(&c.uniqueToTotalCovRatio)->default_value(0.8), "uniqueness filter for scan windows [0,1]")
+      ("scan-regions,r", boost::program_options::value<boost::filesystem::path>(&c.scanFile), "scanning regions in BED format")
+      ("mad-cutoff,d", boost::program_options::value<uint16_t>(&c.mad)->default_value(3), "median + 3 * mad count cutoff")
+      ("percentile,p", boost::program_options::value<float>(&c.exclgc)->default_value(0.0005), "excl. extreme GC fraction")
+      ("no-window-selection,n", "no scan window selection")
+      ;
+    
+    boost::program_options::options_description hidden("Hidden options");
+    hidden.add_options()
+      ("input-file", boost::program_options::value<boost::filesystem::path>(&c.bamFile), "input bam file")
+      ("panelfile,z", boost::program_options::value<boost::filesystem::path>(&c.panelfile), "output panel file")
+      ;
+
+    boost::program_options::positional_options_description pos_args;
+    pos_args.add("input-file", -1);
+
+    // Set the visibility
+    boost::program_options::options_description cmdline_options;
+    cmdline_options.add(generic).add(window).add(gcopt).add(hidden);
+    boost::program_options::options_description visible_options;
+    visible_options.add(generic).add(window).add(gcopt);
+
+    // Parse command-line
+    boost::program_options::variables_map vm;
+    boost::program_options::store(boost::program_options::command_line_parser(argc, argv).options(cmdline_options).positional(pos_args).run(), vm);
+    boost::program_options::notify(vm);
+
+    // Check command line arguments
+    if ((vm.count("help")) || (!vm.count("input-file")) || (!vm.count("genome")) || (!vm.count("mappability"))) {
+      std::cout << std::endl;
+      std::cout << "Usage: delly " << argv[0] << " [OPTIONS] -g <genome.fa> -m <genome.map> <aligned.bam>" << std::endl;
+      std::cout << visible_options << "\n";
+      return 1;
+    }
+
+    // Show cmd
+    boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
+    std::cout << '[' << boost::posix_time::to_simple_string(now) << "] ";
+    std::cout << "delly ";
+    for(int i=0; i<argc; ++i) { std::cout << argv[i] << ' '; }
+    std::cout << std::endl;
+
+    // Stats file
+    if (vm.count("statsfile")) c.hasStatsFile = true;
+    else c.hasStatsFile = false;
+
+    // BED intervals
+    if (vm.count("bed-intervals")) c.hasBedFile = true;
+    else c.hasBedFile = false;
+
+    // Scan regions
+    if (vm.count("scan-regions")) c.hasScanFile = true;
+    else c.hasScanFile = false;
+
+    // Scan window selection
+    if (vm.count("no-window-selection")) c.noScanWindowSelection = true;
+    else c.noScanWindowSelection = false;
+
+    // Adaptive windowing
+    if (vm.count("adaptive-windowing")) c.adaptive = true;
+    else c.adaptive = false;
+
+    // CNV mode
+    if (vm.count("panelfile")) c.hasPanelFile = true;
+    else c.hasPanelFile = false;
+    
+    // Check window size
+    if (c.window_offset > c.window_size) c.window_offset = c.window_size;
+    if (c.window_size == 0) c.window_size = 1;
+    if (c.window_offset == 0) c.window_offset = 1;
+
+    // Check bam file
+    LibraryInfo li;
+    if (!(boost::filesystem::exists(c.bamFile) && boost::filesystem::is_regular_file(c.bamFile) && boost::filesystem::file_size(c.bamFile))) {
+      std::cerr << "Alignment file is missing: " << c.bamFile.string() << std::endl;
+      return 1;
+    } else {
+      // Get scan regions
+      typedef boost::icl::interval_set<uint32_t> TChrIntervals;
+      typedef typename TChrIntervals::interval_type TIVal;
+      typedef std::vector<TChrIntervals> TRegionsGenome;
+      TRegionsGenome scanRegions;
+
+      // Open BAM file
+      samFile* samfile = sam_open(c.bamFile.string().c_str(), "r");
+      if (samfile == NULL) {
+	std::cerr << "Fail to open file " << c.bamFile.string() << std::endl;
+	return 1;
+      }
+      hts_idx_t* idx = sam_index_load(samfile, c.bamFile.string().c_str());
+      if (idx == NULL) {
+	if (bam_index_build(c.bamFile.string().c_str(), 0) != 0) {
+	  std::cerr << "Fail to open index for " << c.bamFile.string() << std::endl;
+	  return 1;
+	}
+      }
+      bam_hdr_t* hdr = sam_hdr_read(samfile);
+      if (hdr == NULL) {
+	std::cerr << "Fail to open header for " << c.bamFile.string() << std::endl;
+	return 1;
+      }
+      c.nchr = hdr->n_targets;
+      c.minChrLen = setMinChrLen(hdr, 0.95);
+      std::string sampleName = "unknown";
+      getSMTag(std::string(hdr->text), c.bamFile.stem().string(), sampleName);
+      c.sampleName = sampleName;
+
+      // Check matching chromosome names
+      faidx_t* faiRef = fai_load(c.genome.string().c_str());
+      faidx_t* faiMap = fai_load(c.mapFile.string().c_str());
+      uint32_t mapFound = 0;
+      uint32_t refFound = 0;
+      for(int32_t refIndex=0; refIndex < hdr->n_targets; ++refIndex) {
+	std::string tname(hdr->target_name[refIndex]);
+	if (faidx_has_seq(faiMap, tname.c_str())) ++mapFound;
+	if (faidx_has_seq(faiRef, tname.c_str())) ++refFound;
+	else {
+	  std::cerr << "Warning: BAM chromosome " << tname << " not present in reference genome!" << std::endl;
+	}
+      }
+      fai_destroy(faiRef);
+      fai_destroy(faiMap);
+      if (!mapFound) {
+	std::cerr << "Mappability map chromosome naming disagrees with BAM file!" << std::endl;
+	return 1;
+      }
+      if (!refFound) {
+	std::cerr << "Reference genome chromosome naming disagrees with BAM file!" << std::endl;
+	return 1;
+      }
+
+      // Estimate library params
+      if (c.hasScanFile) {
+	if (!_parseBedIntervals(c.scanFile.string(), c.hasScanFile, hdr, scanRegions)) {
+	  std::cerr << "Warning: Couldn't parse BED intervals. Do the chromosome names match?" << std::endl;
+	  return 1;
+	}
+      } else {
+	scanRegions.resize(hdr->n_targets);
+	for (int32_t refIndex = 0; refIndex < hdr->n_targets; ++refIndex) {
+	  scanRegions[refIndex].insert(TIVal::right_open(0, hdr->target_len[refIndex]));
+	}
+      }
+      typedef std::vector<LibraryInfo> TSampleLibrary;
+      TSampleLibrary sampleLib(1, LibraryInfo());
+      CountDNAConfigLib dellyConf;
+      dellyConf.genome = c.genome;
+      dellyConf.files.push_back(c.bamFile);
+      dellyConf.madCutoff = 9;
+      dellyConf.madNormalCutoff = c.mad;
+      getLibraryParams(dellyConf, scanRegions, sampleLib);
+      li = sampleLib[0];
+      if (!li.median) {
+	li.median = 250;
+	li.mad = 15;
+	li.minNormalISize = 0;
+	li.maxNormalISize = 400;
+      }
+      c.meanisize = ((int32_t) (li.median / 2)) * 2 + 1;
+      
+      // Clean-up
+      bam_hdr_destroy(hdr);
+      hts_idx_destroy(idx);
+      sam_close(samfile);
+    }
+
+    // GC bias estimation
+    typedef std::pair<uint32_t, uint32_t> TGCBound;
+    TGCBound gcbound;
+    std::vector<GcBias> gcbias(c.meanisize + 1, GcBias());
+    {
+      // Scan genomic windows
+      typedef std::vector<ScanWindow> TWindowCounts;
+      typedef std::vector<TWindowCounts> TGenomicWindowCounts;
+      TGenomicWindowCounts scanCounts(c.nchr, TWindowCounts());
+      scan(c, li, scanCounts);
+    
+      // Select stable windows
+      selectWindows(c, scanCounts);
+
+      // Estimate GC bias
+      gcBias(c, scanCounts, li, gcbias, gcbound);
+
+      // Statistics output
+      if (c.hasStatsFile) {
+	// Open stats file
+	boost::iostreams::filtering_ostream statsOut;
+	statsOut.push(boost::iostreams::gzip_compressor());
+	statsOut.push(boost::iostreams::file_sink(c.statsFile.string().c_str(), std::ios_base::out | std::ios_base::binary));
+	
+	// Library Info
+	statsOut << "LP\t" << li.rs << ',' << li.median << ',' << li.mad << ',' << li.minNormalISize << ',' << li.maxNormalISize << std::endl;
+	
+	// Scan window summry
+	samFile* samfile = sam_open(c.bamFile.string().c_str(), "r");
+	bam_hdr_t* hdr = sam_hdr_read(samfile);
+	statsOut << "SW\tchrom\tstart\tend\tselected\tcoverage\tuniqcov" <<  std::endl;
+	for(uint32_t refIndex = 0; refIndex < (uint32_t) hdr->n_targets; ++refIndex) {
+	  for(uint32_t i = 0; i < scanCounts[refIndex].size(); ++i) {
+	    statsOut << "SW\t" <<  hdr->target_name[refIndex] << '\t' << scanCounts[refIndex][i].start << '\t' << scanCounts[refIndex][i].end << '\t' << scanCounts[refIndex][i].select << '\t' << scanCounts[refIndex][i].cov << '\t' << scanCounts[refIndex][i].uniqcov << std::endl;
+	  }
+	}
+	bam_hdr_destroy(hdr);
+	sam_close(samfile);
+	
+	// GC bias summary
+	statsOut << "GC\tgcsum\tsample\treference\tpercentileSample\tpercentileReference\tfractionSample\tfractionReference\tobsexp\tmeancoverage" << std::endl;
+	for(uint32_t i = 0; i < gcbias.size(); ++i) statsOut << "GC\t" << i << "\t" << gcbias[i].sample << "\t" << gcbias[i].reference << "\t" << gcbias[i].percentileSample << "\t" << gcbias[i].percentileReference << "\t" << gcbias[i].fractionSample << "\t" << gcbias[i].fractionReference << "\t" << gcbias[i].obsexp << "\t" << gcbias[i].coverage << std::endl;
+	statsOut << "BoundsGC\t" << gcbound.first << "," << gcbound.second << std::endl;
+	statsOut.pop();
+	statsOut.pop();
+      }
+    }
+      
+    // Count reads
+    if (bamCount(c, li, gcbias, gcbound)) {
+      std::cerr << "Read counting error!" << std::endl;
+      return 1;
+    }
+
+    // Done
+    now = boost::posix_time::second_clock::local_time();
+    std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Done." << std::endl;
+
+    return 0;
+  }
+
+  
+}
+
+#endif


=====================================
src/coverage.h
=====================================
@@ -144,7 +144,7 @@ namespace torali {
   
   template<typename TConfig, typename TSVs, typename TBreakProbes, typename TGenomicBpRegion>
   inline void
-  _generateProbes(TConfig const& c, bam_hdr_t* hdr, TSVs& svs, TBreakProbes& refProbeArr, TBreakProbes& consProbeArr, TGenomicBpRegion& bpRegion) {
+    _generateProbes(TConfig const& c, bam_hdr_t* hdr, TSVs& svs, TBreakProbes& refProbeArr, TBreakProbes& consProbeArr, TGenomicBpRegion& bpRegion, std::vector<bool>& svOnChr) {
     typedef typename TBreakProbes::value_type TProbes;
 
     // Preprocess REF and ALT
@@ -161,7 +161,8 @@ namespace torali {
       // Iterate all structural variants
       for(typename TSVs::iterator itSV = svs.begin(); itSV != svs.end(); ++itSV) {
 	if ((itSV->chr != refIndex) && (itSV->chr2 != refIndex)) continue;
-
+	svOnChr[refIndex] = true;
+	
 	// Lazy loading of reference sequence
 	if (seq == NULL) {
 	  int32_t seqlen = -1;
@@ -291,9 +292,10 @@ namespace torali {
     typedef std::vector<BpRegion> TBpRegion;
     typedef std::vector<TBpRegion> TGenomicBpRegion;
     TGenomicBpRegion bpRegion(hdr[0]->n_targets, TBpRegion());
-
+    std::vector<bool> svOnChr(hdr[0]->n_targets, false);
+    
     // Generate probes
-    _generateProbes(c, hdr[0], svs, refProbeArr, consProbeArr, bpRegion);
+    _generateProbes(c, hdr[0], svs, refProbeArr, consProbeArr, bpRegion, svOnChr);
   
     // Debug
     //for(uint32_t k = 0; k < 2; ++k) {
@@ -340,7 +342,7 @@ namespace torali {
 	++show_progress;
       
 	// Any SV breakpoints on this chromosome?
-	if (bpRegion[refIndex].empty()) continue;
+	if (!svOnChr[refIndex]) continue;
 
 	// Check we have mapped reads on this chromosome
 	bool nodata = true;
@@ -441,6 +443,7 @@ namespace torali {
 	      // Fetch all relevant SVs
 	      typename TBpRegion::iterator itBp = std::lower_bound(bpRegion[refIndex].begin(), bpRegion[refIndex].end(), BpRegion(rbegin), SortBp<BpRegion>());
 	      for(; ((itBp != bpRegion[refIndex].end()) && (rec->core.pos + rec->core.l_qseq >= itBp->bppos)); ++itBp) {
+		if ((countMap[file_c][itBp->id].ref.size() + countMap[file_c][itBp->id].alt.size()) >= c.maxGenoReadCount) continue;
 		// Read spans breakpoint?
 		if ((hasSoftClip) || ((!hasClip) && (rec->core.pos + c.minimumFlankSize + itBp->homLeft <= itBp->bppos) &&  (rec->core.pos + rec->core.l_qseq >= itBp->bppos + c.minimumFlankSize + itBp->homRight))) {
 		  std::string consProbe = consProbeArr[itBp->bpPoint][itBp->id];
@@ -535,9 +538,9 @@ namespace torali {
 	      }
 	    }
 	  }
-	
+
 	  // Read-count and spanning annotation
-	  if ((!(rec->core.flag & BAM_FPAIRED)) || (bpRegion[rec->core.mtid].empty())) continue;
+	  if ((!(rec->core.flag & BAM_FPAIRED)) || (!svOnChr[rec->core.mtid])) continue;
 
 	  // Clean-up the read store for identical alignment positions
 	  if (rec->core.pos > lastAlignedPos) {
@@ -574,7 +577,7 @@ namespace torali {
 	      qualitiestra[hv] = 0;
 	      cliptra[hv] = false;
 	    }
-	    
+
 	    // Pair quality
 	    if (pairQuality < c.minGenoQual) continue; // Low quality pair
 	    


=====================================
src/delly.cpp
=====================================
@@ -18,6 +18,7 @@
 #include "filter.h"
 #include "merge.h"
 #include "tegua.h"
+#include "coral.h"
 
 using namespace torali;
 
@@ -26,13 +27,17 @@ inline void
 displayUsage() {
   std::cout << "Usage: delly <command> <arguments>" << std::endl;
   std::cout << std::endl;
-  std::cout << "Commands:" << std::endl;
-  std::cout << std::endl;
+  std::cout << "Short-read commands:" << std::endl;
   std::cout << "    call         discover and genotype structural variants" << std::endl;
-  std::cout << "    lr           long-read SV discovery" << std::endl;
   std::cout << "    merge        merge structural variants across VCF/BCF files and within a single VCF/BCF file" << std::endl;
   std::cout << "    filter       filter somatic or germline structural variants" << std::endl;
   std::cout << std::endl;
+  std::cout << "Long-read commands:" << std::endl;
+  std::cout << "    lr           long-read SV discovery" << std::endl;
+  std::cout << std::endl;
+  std::cout << "Read-depth commands:" << std::endl;
+  std::cout << "    rd           read-depth normalization" << std::endl;
+  std::cout << std::endl;
   std::cout << std::endl;
 }
 
@@ -68,6 +73,9 @@ int main(int argc, char **argv) {
     else if ((std::string(argv[1]) == "lr")) {
       return tegua(argc-1,argv+1);
     }
+    else if ((std::string(argv[1]) == "rd")) {
+      return coral(argc-1,argv+1);
+    }
     else if ((std::string(argv[1]) == "filter")) {
       return filter(argc-1,argv+1);
     }


=====================================
src/delly.h
=====================================
@@ -54,6 +54,7 @@ namespace torali
     uint16_t minTraQual;
     uint16_t minGenoQual;
     uint16_t madCutoff;
+    uint16_t madNormalCutoff;
     int32_t nchr;
     int32_t minimumFlankSize;
     int32_t indelsize;
@@ -61,6 +62,8 @@ namespace torali
     uint32_t minRefSep;
     uint32_t maxReadSep;
     uint32_t minClip;
+    uint32_t maxGenoReadCount;
+    uint32_t minCliqueSize;
     float flankQuality;
     bool hasExcludeFile;
     bool hasVcfFile;
@@ -196,6 +199,7 @@ namespace torali
   int delly(int argc, char **argv) {
     Config c;
     c.isHaplotagged = false;
+    c.madNormalCutoff = 5;
 
     // Define generic options
     std::string svtype;
@@ -214,6 +218,7 @@ namespace torali
       ("qual-tra,r", boost::program_options::value<uint16_t>(&c.minTraQual)->default_value(20), "min. PE quality for translocation")
       ("mad-cutoff,s", boost::program_options::value<uint16_t>(&c.madCutoff)->default_value(9), "insert size cutoff, median+s*MAD (deletions only)")
       ("minclip,c", boost::program_options::value<uint32_t>(&c.minClip)->default_value(25), "min. clipping length")
+      ("min-clique-size,z", boost::program_options::value<uint32_t>(&c.minCliqueSize)->default_value(2), "min. PE/SR clique size")
       ("minrefsep,m", boost::program_options::value<uint32_t>(&c.minRefSep)->default_value(25), "min. reference separation")
       ("maxreadsep,n", boost::program_options::value<uint32_t>(&c.maxReadSep)->default_value(40), "max. read separation")
       ;
@@ -230,6 +235,7 @@ namespace torali
     hidden.add_options()
       ("input-file", boost::program_options::value< std::vector<boost::filesystem::path> >(&c.files), "input file")
       ("pruning,j", boost::program_options::value<uint32_t>(&c.graphPruning)->default_value(1000), "PE graph pruning cutoff")
+      ("max-geno-count,a", boost::program_options::value<uint32_t>(&c.maxGenoReadCount)->default_value(250), "max. number of reads aligned for SR genotyping")
       ;
     
     boost::program_options::positional_options_description pos_args;
@@ -259,6 +265,9 @@ namespace torali
     // Dump PE and SR support?
     if (vm.count("dump")) c.hasDumpFile = true;
     else c.hasDumpFile = false;
+
+    // Clique size
+    if (c.minCliqueSize < 2) c.minCliqueSize = 2;
     
     // Check quality cuts
     if (c.minMapQual > c.minTraQual) c.minTraQual = c.minMapQual;


=====================================
src/gcbias.h
=====================================
@@ -0,0 +1,343 @@
+#ifndef GCBIAS_H
+#define GCBIAS_H
+
+#include <boost/unordered_map.hpp>
+#include <boost/program_options/cmdline.hpp>
+#include <boost/program_options/options_description.hpp>
+#include <boost/program_options/parsers.hpp>
+#include <boost/program_options/variables_map.hpp>
+#include <boost/date_time/posix_time/posix_time.hpp>
+#include <boost/date_time/gregorian/gregorian.hpp>
+#include <boost/tuple/tuple.hpp>
+#include <boost/filesystem.hpp>
+#include <boost/progress.hpp>
+#include <boost/dynamic_bitset.hpp>
+
+#include <htslib/faidx.h>
+#include <htslib/sam.h>
+#include <htslib/vcf.h>
+
+#include "scan.h"
+#include "util.h"
+
+namespace torali
+{
+  struct GcBias {
+    int32_t sample;
+    int32_t reference;
+    double fractionSample;
+    double fractionReference;
+    double percentileSample;
+    double percentileReference;
+    double obsexp;
+    double coverage;
+
+    GcBias() : sample(0), reference(0), fractionSample(0), fractionReference(0), percentileSample(0), percentileReference(0), obsexp(0), coverage(0) {}
+  };
+
+  template<typename TConfig>
+  inline std::pair<uint32_t, uint32_t>
+  gcBound(TConfig const& c, std::vector<GcBias>& gcbias) {
+    uint32_t lowerBound = 0;
+    uint32_t upperBound = gcbias.size();
+    for(uint32_t i = 0; i < gcbias.size(); ++i) {
+      if ((gcbias[i].percentileSample < c.exclgc) || (gcbias[i].percentileReference < c.exclgc)) lowerBound = i;
+      if ((gcbias[i].percentileSample + c.exclgc > 1) || (gcbias[i].percentileReference + c.exclgc > 1)) {
+	if (i < upperBound) upperBound = i;
+      }
+    }
+    if (lowerBound >= upperBound) upperBound = lowerBound + 1;
+    /*
+    // Adjust total
+    uint64_t totalSampleCount = 0;
+    uint64_t totalReferenceCount = 0;
+    for(uint32_t i = lowerBound + 1; i < upperBound; ++i) {
+      totalSampleCount += gcbias[i].sample;
+      totalReferenceCount += gcbias[i].reference;
+    }    
+    // Re-estimate observed/expected
+    for(uint32_t i = lowerBound + 1; i < upperBound; ++i) {
+      gcbias[i].fractionSample = (double) gcbias[i].sample / (double) totalSampleCount;
+      gcbias[i].fractionReference = (double) gcbias[i].reference / (double) totalReferenceCount;
+      gcbias[i].obsexp = 1;
+      if (gcbias[i].fractionReference > 0) gcbias[i].obsexp = gcbias[i].fractionSample / gcbias[i].fractionReference;
+    }
+    */
+    
+    return std::make_pair(lowerBound, upperBound);
+  }
+
+
+  inline double
+  getPercentIdentity(bam1_t const* rec, char const* seq) {
+    // Sequence
+    std::string sequence;
+    sequence.resize(rec->core.l_qseq);
+    uint8_t* seqptr = bam_get_seq(rec);
+    for (int i = 0; i < rec->core.l_qseq; ++i) sequence[i] = "=ACMGRSVTWYHKDBN"[bam_seqi(seqptr, i)];
+
+    // Reference slice
+    std::string refslice = boost::to_upper_copy(std::string(seq + rec->core.pos, seq + lastAlignedPosition(rec)));
+	      
+    // Percent identity
+    uint32_t rp = 0; // reference pointer
+    uint32_t sp = 0; // sequence pointer
+    uint32_t* cigar = bam_get_cigar(rec);
+    int32_t matchCount = 0;
+    int32_t mismatchCount = 0;
+    for (std::size_t i = 0; i < rec->core.n_cigar; ++i) {
+      if ((bam_cigar_op(cigar[i]) == BAM_CMATCH) || (bam_cigar_op(cigar[i]) == BAM_CEQUAL) || (bam_cigar_op(cigar[i]) == BAM_CDIFF)) {
+	// match or mismatch
+	for(std::size_t k = 0; k<bam_cigar_oplen(cigar[i]);++k) {
+	  if (sequence[sp] == refslice[rp]) ++matchCount;
+	  else ++mismatchCount;
+	  ++sp;
+	  ++rp;
+	}
+      } else if (bam_cigar_op(cigar[i]) == BAM_CDEL) {
+	mismatchCount += bam_cigar_oplen(cigar[i]);
+	rp += bam_cigar_oplen(cigar[i]);
+      } else if (bam_cigar_op(cigar[i]) == BAM_CINS) {
+	mismatchCount += bam_cigar_oplen(cigar[i]);
+	sp += bam_cigar_oplen(cigar[i]);
+      } else if (bam_cigar_op(cigar[i]) == BAM_CSOFT_CLIP) {
+	sp += bam_cigar_oplen(cigar[i]);
+      } else if(bam_cigar_op(cigar[i]) == BAM_CHARD_CLIP) {
+      } else if (bam_cigar_op(cigar[i]) == BAM_CREF_SKIP) {
+	mismatchCount += bam_cigar_oplen(cigar[i]);
+	rp += bam_cigar_oplen(cigar[i]);
+      } else {
+	std::cerr << "Unknown Cigar options" << std::endl;
+	return 1;
+      }
+    }
+    double percid = 0;
+    if (matchCount + mismatchCount > 0) percid = (double) matchCount / (double) (matchCount + mismatchCount);
+    return percid;
+  }
+
+
+  
+  template<typename TConfig, typename TGCBound>
+  inline void
+  gcBias(TConfig const& c, std::vector< std::vector<ScanWindow> > const& scanCounts, LibraryInfo const& li, std::vector<GcBias>& gcbias, TGCBound& gcbound) {
+    // Load bam file
+    samFile* samfile = sam_open(c.bamFile.string().c_str(), "r");
+    hts_set_fai_filename(samfile, c.genome.string().c_str());
+    hts_idx_t* idx = sam_index_load(samfile, c.bamFile.string().c_str());
+    bam_hdr_t* hdr = sam_hdr_read(samfile);
+
+    // Parse bam (contig by contig)
+    boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
+    std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Estimate GC bias" << std::endl;
+    boost::progress_display show_progress( hdr->n_targets );
+
+    faidx_t* faiMap = fai_load(c.mapFile.string().c_str());
+    faidx_t* faiRef = fai_load(c.genome.string().c_str());
+    for (int refIndex = 0; refIndex < hdr->n_targets; ++refIndex) {
+      ++show_progress;
+      if (scanCounts[refIndex].empty()) continue;
+
+      // Bin map
+      std::vector<uint16_t> binMap;
+      if (c.hasScanFile) {
+	// Fill bin map
+	binMap.resize(hdr->target_len[refIndex], LAST_BIN);
+	for(uint32_t bin = 0;((bin < scanCounts[refIndex].size()) && (bin < LAST_BIN)); ++bin) {
+	  for(int32_t k = scanCounts[refIndex][bin].start; k < scanCounts[refIndex][bin].end; ++k) binMap[k] = bin;
+	}
+      }
+      
+      // Check presence in mappability map
+      std::string tname(hdr->target_name[refIndex]);
+      int32_t seqlen = faidx_seq_len(faiMap, tname.c_str());
+      if (seqlen == - 1) continue;
+      else seqlen = -1;
+      char* seq = faidx_fetch_seq(faiMap, tname.c_str(), 0, faidx_seq_len(faiMap, tname.c_str()), &seqlen);
+
+      // Check presence in reference
+      seqlen = faidx_seq_len(faiRef, tname.c_str());
+      if (seqlen == - 1) continue;
+      else seqlen = -1;
+      char* ref = faidx_fetch_seq(faiRef, tname.c_str(), 0, faidx_seq_len(faiRef, tname.c_str()), &seqlen);
+
+      // Get GC and Mappability
+      std::vector<uint16_t> uniqContent(hdr->target_len[refIndex], 0);
+      std::vector<uint16_t> gcContent(hdr->target_len[refIndex], 0);
+      {
+	// Mappability map
+	typedef boost::dynamic_bitset<> TBitSet;
+	TBitSet uniq(hdr->target_len[refIndex], false);
+	for(uint32_t i = 0; i < hdr->target_len[refIndex]; ++i) {
+	  if (seq[i] == 'C') uniq[i] = true;
+	}
+
+	// GC map
+	typedef boost::dynamic_bitset<> TBitSet;
+	TBitSet gcref(hdr->target_len[refIndex], false);
+	for(uint32_t i = 0; i < hdr->target_len[refIndex]; ++i) {
+	  if ((ref[i] == 'c') || (ref[i] == 'C') || (ref[i] == 'g') || (ref[i] == 'G')) gcref[i] = 1;
+	}
+
+	// Sum across fragments
+	int32_t halfwin = (int32_t) (c.meanisize / 2);
+	int32_t usum = 0;
+	int32_t gcsum = 0;
+	for(int32_t pos = halfwin; pos < (int32_t) hdr->target_len[refIndex] - halfwin; ++pos) {
+	  if (pos == halfwin) {
+	    for(int32_t i = pos - halfwin; i<=pos+halfwin; ++i) {
+	      usum += uniq[i];
+	      gcsum += gcref[i];
+	    }
+	  } else {
+	    usum -= uniq[pos - halfwin - 1];
+	    gcsum -= gcref[pos - halfwin - 1];
+	    usum += uniq[pos + halfwin];
+	    gcsum += gcref[pos + halfwin];
+	  }
+	  gcContent[pos] = gcsum;
+	  uniqContent[pos] = usum;
+	}
+      }
+
+      // Coverage track
+      typedef uint16_t TCount;
+      uint32_t maxCoverage = std::numeric_limits<TCount>::max();
+      typedef std::vector<TCount> TCoverage;
+      TCoverage cov(hdr->target_len[refIndex], 0);
+      
+      // Mate map
+      typedef boost::unordered_map<std::size_t, bool> TMateMap;
+      TMateMap mateMap;
+      
+      // Parse BAM
+      hts_itr_t* iter = sam_itr_queryi(idx, refIndex, 0, hdr->target_len[refIndex]);
+      bam1_t* rec = bam_init1();
+      int32_t lastAlignedPos = 0;
+      std::set<std::size_t> lastAlignedPosReads;
+      while (sam_itr_next(samfile, iter, rec) >= 0) {
+	if (rec->core.flag & (BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP | BAM_FSUPPLEMENTARY | BAM_FUNMAP)) continue;
+	if ((rec->core.flag & BAM_FPAIRED) && ((rec->core.flag & BAM_FMUNMAP) || (rec->core.tid != rec->core.mtid))) continue;
+	if (rec->core.qual < c.minQual) continue;
+
+	int32_t midPoint = rec->core.pos + halfAlignmentLength(rec);
+	if (rec->core.flag & BAM_FPAIRED) {
+	  // Clean-up the read store for identical alignment positions
+	  if (rec->core.pos > lastAlignedPos) {
+	    lastAlignedPosReads.clear();
+	    lastAlignedPos = rec->core.pos;
+	  }
+
+	  // Process pair
+	  if ((rec->core.pos < rec->core.mpos) || ((rec->core.pos == rec->core.mpos) && (lastAlignedPosReads.find(hash_string(bam_get_qname(rec))) == lastAlignedPosReads.end()))) {
+	    // First read
+	    lastAlignedPosReads.insert(hash_string(bam_get_qname(rec)));
+	    std::size_t hv = hash_pair(rec);
+	    mateMap[hv]= true;
+	    continue;
+	  } else {
+	    // Second read
+	    std::size_t hv = hash_pair_mate(rec);
+	    if ((mateMap.find(hv) == mateMap.end()) || (!mateMap[hv])) continue; // Mate discarded
+	    mateMap[hv] = false;
+	  }
+	
+	  // Insert size filter
+	  int32_t isize = (rec->core.pos + alignmentLength(rec)) - rec->core.mpos;
+	  if ((li.minNormalISize < isize) && (isize < li.maxNormalISize)) {
+	    midPoint = rec->core.mpos + (int32_t) (isize/2);
+	  } else {
+	    if (rec->core.flag & BAM_FREVERSE) midPoint = rec->core.pos + alignmentLength(rec) - (c.meanisize / 2);
+	    else midPoint = rec->core.pos + (c.meanisize / 2);
+	  }
+	}
+
+	// Count fragment
+	if ((midPoint >= 0) && (midPoint < (int32_t) hdr->target_len[refIndex]) && (cov[midPoint] < maxCoverage - 1)) ++cov[midPoint];
+      }
+      bam_destroy1(rec);
+      hts_itr_destroy(iter);
+      if (seq != NULL) free(seq);
+      if (ref != NULL) free(ref);
+
+      // Summarize GC coverage for this chromosome
+      for(uint32_t i = 0; i < hdr->target_len[refIndex]; ++i) {
+	if (uniqContent[i] >= c.fragmentUnique * c.meanisize) {
+	  // Valid bin?
+	  int32_t bin = _findScanWindow(c, hdr->target_len[refIndex], binMap, i);
+	  if ((bin >= 0) && (scanCounts[refIndex][bin].select)) {
+	    ++gcbias[gcContent[i]].reference;
+	    gcbias[gcContent[i]].sample += cov[i];
+	    gcbias[gcContent[i]].coverage += cov[i];
+	  }
+	}
+      }
+    }
+    
+    // Normalize GC coverage
+    for(uint32_t i = 0; i < gcbias.size(); ++i) {
+      if (gcbias[i].reference) gcbias[i].coverage /= (double) gcbias[i].reference;
+      else gcbias[i].coverage = 0;
+    }
+
+    // Determine percentiles
+    uint64_t totalSampleCount = 0;
+    uint64_t totalReferenceCount = 0;
+    for(uint32_t i = 0; i < gcbias.size(); ++i) {
+      totalSampleCount += gcbias[i].sample;
+      totalReferenceCount += gcbias[i].reference;
+    }
+    uint64_t cumSample = 0;
+    uint64_t cumReference = 0;
+    for(uint32_t i = 0; i < gcbias.size(); ++i) {
+      cumSample += gcbias[i].sample;
+      cumReference += gcbias[i].reference;
+      gcbias[i].fractionSample = (double) gcbias[i].sample / (double) totalSampleCount;
+      gcbias[i].fractionReference = (double) gcbias[i].reference / (double) totalReferenceCount;
+      gcbias[i].percentileSample = (double) cumSample / (double) totalSampleCount;
+      gcbias[i].percentileReference = (double) cumReference / (double) totalReferenceCount;
+      gcbias[i].obsexp = 1;
+      if (gcbias[i].fractionReference > 0) gcbias[i].obsexp = gcbias[i].fractionSample / gcbias[i].fractionReference;
+    }
+
+    // Estimate correctable GC range
+    gcbound = gcBound(c, gcbias);
+
+    // Adjust correction to the callable range
+    totalSampleCount = 0;
+    totalReferenceCount = 0;
+    for(uint32_t i = gcbound.first + 1; i < gcbound.second; ++i) {
+      totalSampleCount += gcbias[i].sample;
+      totalReferenceCount += gcbias[i].reference;
+    }
+    cumSample = 0;
+    cumReference = 0;
+    // Re-initialize
+    for(uint32_t i = 0; i < gcbias.size(); ++i) {
+      gcbias[i].fractionSample = 0;
+      gcbias[i].fractionReference = 0;
+      gcbias[i].percentileSample = 0;
+      gcbias[i].percentileReference = 0;
+      gcbias[i].obsexp = 1;
+    }
+    for(uint32_t i = gcbound.first + 1; i < gcbound.second; ++i) {
+      cumSample += gcbias[i].sample;
+      cumReference += gcbias[i].reference;
+      gcbias[i].fractionSample = (double) gcbias[i].sample / (double) totalSampleCount;
+      gcbias[i].fractionReference = (double) gcbias[i].reference / (double) totalReferenceCount;
+      gcbias[i].percentileSample = (double) cumSample / (double) totalSampleCount;
+      gcbias[i].percentileReference = (double) cumReference / (double) totalReferenceCount;
+      gcbias[i].obsexp = 1;
+      if (gcbias[i].fractionReference > 0) gcbias[i].obsexp = gcbias[i].fractionSample / gcbias[i].fractionReference;
+    }
+    
+    fai_destroy(faiRef);
+    fai_destroy(faiMap);
+    hts_idx_destroy(idx);
+    sam_close(samfile);
+    bam_hdr_destroy(hdr);
+  }
+
+}
+
+#endif


=====================================
src/genotype.h
=====================================
@@ -335,6 +335,9 @@ namespace torali
 	    // Genotype all SVs covered by this read
 	    for(typename TSVSeqHit::iterator git = genoMap.begin(); git != genoMap.end(); ++git) {
 	      int32_t svid = git->first;
+	      uint32_t maxGenoReadCount = 500;
+	      if ((jctMap[file_c][svid].ref.size() + jctMap[file_c][svid].alt.size()) >= maxGenoReadCount) continue;
+	      
 	      int32_t rpHit = git->second.first;
 	      int32_t spHit = git->second.second;
 
@@ -367,28 +370,15 @@ namespace torali
 	      }
 	    
 	      // Compute alignment to alternative haplotype
-	      TAlign alignAlt;
 	      DnaScore<int> simple(c.aliscore.match, c.aliscore.mismatch, c.aliscore.mismatch, c.aliscore.mismatch);
 	      AlignConfig<true, false> semiglobal;
-	      double scoreAlt = needle(gbp[svid].alt, subseq, alignAlt, semiglobal, simple);
+	      double scoreAlt = needleBanded(gbp[svid].alt, subseq, semiglobal, simple);
 	      scoreAlt /= (double) (c.flankQuality * gbp[svid].alt.size() * simple.match + (1.0 - c.flankQuality) * gbp[svid].alt.size() * simple.mismatch);
 	    
 	      // Compute alignment to reference haplotype
-	      TAlign alignRef;
-	      double scoreRef = needle(gbp[svid].ref, subseq, alignRef, semiglobal, simple);
+	      double scoreRef = needleBanded(gbp[svid].ref, subseq, semiglobal, simple);
 	      scoreRef /= (double) (c.flankQuality * gbp[svid].ref.size() * simple.match + (1.0 - c.flankQuality) * gbp[svid].ref.size() * simple.mismatch);
 
-	      // Debug alignment to REF and ALT
-	      //std::cerr << "svid:" << svid << ",seqbp:" << spHit << ",readlen:" << readlen << ",reverse:" << (int32_t) (rec->core.flag & BAM_FREVERSE) << ",alt:" << scoreAlt << ",ref:" << scoreRef << std::endl;
-	      //for(uint32_t i = 0; i< alignAlt.shape()[0]; ++i) {
-	      //for(uint32_t j = 0; j< alignAlt.shape()[1]; ++j) std::cerr << alignAlt[i][j];
-	      //std::cerr << std::endl;
-	      //}
-	      //for(uint32_t i = 0; i< alignRef.shape()[0]; ++i) {
-	      //for(uint32_t j = 0; j< alignRef.shape()[1]; ++j) std::cerr << alignRef[i][j];
-	      //std::cerr << std::endl;
-	      //}
-	      
 	      // Any confident alignment?
 	      if ((scoreRef > 1) || (scoreAlt > 1)) {
 		if (scoreRef > scoreAlt) {
@@ -398,7 +388,7 @@ namespace torali
 		    quality.resize(rec->core.l_qseq);
 		    uint8_t* qualptr = bam_get_qual(rec);
 		    for (int i = 0; i < rec->core.l_qseq; ++i) quality[i] = qualptr[i];
-		    uint32_t rq = _getAlignmentQual(alignRef, quality);
+		    uint32_t rq = scoreRef * 35;
 		    if (rq >= c.minGenoQual) {
 		      uint8_t* hpptr = bam_aux_get(rec, "HP");
 		      jctMap[file_c][svid].ref.push_back((uint8_t) std::min(rq, (uint32_t) rec->core.qual));
@@ -415,7 +405,7 @@ namespace torali
 		  quality.resize(rec->core.l_qseq);
 		  uint8_t* qualptr = bam_get_qual(rec);
 		  for (int i = 0; i < rec->core.l_qseq; ++i) quality[i] = qualptr[i];
-		  uint32_t aq = _getAlignmentQual(alignAlt, quality);
+		  uint32_t aq = scoreAlt * 35;
 		  if (aq >= c.minGenoQual) {
 		    uint8_t* hpptr = bam_aux_get(rec, "HP");
 		    if (c.hasDumpFile) {


=====================================
src/merge.h
=====================================
@@ -131,6 +131,7 @@ void _fillIntervalMap(MergeConfig const& c, TGenomeIntervals& iScore, TContigMap
 	uint32_t inslenVal = 0;
 	if (bcf_get_info_int32(hdr, rec, "INSLEN", &inslen, &ninslen) > 0) inslenVal = *inslen;
 	if ((inslenVal < c.minsize) || (inslenVal > c.maxsize)) continue;
+	svEnd = svStart + inslenVal; // To enable reciprocal overlap
       } else {
 	// Other intra-chr SV
 	if ((svEnd - svStart < c.minsize) || (svEnd - svStart > c.maxsize)) continue;
@@ -391,6 +392,7 @@ void _outputSelectedIntervals(MergeConfig& c, TGenomeIntervals const& iSelected,
 	if (bcf_get_info_int32(hdr[idx], rec[idx], "END", &svend, &nsvend) > 0) svEnd = *svend;
 	unsigned int inslenVal = 0;
 	if (bcf_get_info_int32(hdr[idx], rec[idx], "INSLEN", &inslen, &ninslen) > 0) inslenVal = *inslen;
+	if (recsvt == 4) svEnd = svStart + inslenVal; // To enable reciprocal overlap
 
 	// Parse INFO fields
 	if ((std::string(svt) == "BND") || ((std::string(svt) == "INS") && (inslenVal >= c.minsize) && (inslenVal <= c.maxsize)) || ((std::string(svt) != "BND") && (std::string(svt) != "INS") && (svEnd - svStart >= c.minsize) && (svEnd - svStart <= c.maxsize))) {


=====================================
src/modvcf.h
=====================================
@@ -651,8 +651,10 @@ vcfOutput(TConfig const& c, std::vector<TStructuralVariantRecord> const& svs, TJ
 	if (gqval[file_c] < 15) ftarr[file_c] = "LowQual";
 	else ftarr[file_c] = "PASS";
       }
-      rec->qual = svIter->mapq;
-
+      int32_t qvalout = svIter->mapq;
+      if (qvalout < 0) qvalout = 0;
+      if (qvalout > 10000) qvalout = 10000;
+      rec->qual = qvalout;
       
       bcf_update_genotypes(hdr, rec, gts, bcf_hdr_nsamples(hdr) * 2);
       bcf_update_format_float(hdr, rec, "GL",  gls, bcf_hdr_nsamples(hdr) * 3);


=====================================
src/needle.h
=====================================
@@ -268,6 +268,58 @@ namespace torali
     // Score
     return s[n];
   }
+
+
+  template<typename TAlignConfig, typename TScoreObject>
+  inline int32_t
+  needleBanded(std::string const& s1, std::string const& s2, TAlignConfig const& ac, TScoreObject const& sc)
+  {
+    typedef typename TScoreObject::TValue TScoreValue;
+
+    // DP Matrix
+    int32_t m = s1.size();
+    int32_t n = s2.size();
+    int32_t band = 100;
+    int32_t lowBand = band;
+    int32_t highBand = band;
+    if (m < n) highBand += n - m;
+    else lowBand += m - n;
+    std::vector<TScoreValue> s(n+1, 0);
+    TScoreValue prevsub = 0;
+    TScoreValue prevprevsub = 0;
+
+    // DP
+    for(int32_t row = 0; row <= m; ++row) {
+      for(int32_t col = std::max(0, row - lowBand); col <= std::min(n, row + highBand); ++col) {
+	// Initialization
+	if ((row == 0) && (col == 0)) {
+	  s[0] = 0;
+	  prevsub = 0;
+	} else if (row == 0) {
+	  s[col] = _horizontalGap(ac, 0, m, col * sc.ge);
+	} else if (col == 0) {
+	  s[0] = _verticalGap(ac, 0, n, row * sc.ge);
+	  if (row - 1 == 0) prevsub = 0;
+	  else prevsub = _verticalGap(ac, 0, n, (row - 1) * sc.ge);
+	} else {
+	  // Recursion
+	  prevprevsub = prevsub;
+	  prevsub = s[col];
+	  if (col == row - lowBand) {
+	    prevprevsub = s[col-1];
+	    s[col - 1] = DELLY_OUTOFBAND;
+	  } else if (col == row + highBand) prevsub = DELLY_OUTOFBAND;
+	  s[col] = std::max(std::max(prevprevsub + (s1[row-1] == s2[col-1] ? sc.match : sc.mismatch), prevsub + _verticalGap(ac, col, n, sc.ge)), s[col-1] + _horizontalGap(ac, row, m, sc.ge));
+	}
+      }
+    }
+	
+    // Score
+    return s[n];
+  }
+
+
+
   
   template<typename TAlign1, typename TAlign2, typename TAlign, typename TAlignConfig, typename TScoreObject>
   inline int


=====================================
src/scan.h
=====================================
@@ -0,0 +1,298 @@
+#ifndef SCAN_H
+#define SCAN_H
+
+#include <limits>
+
+#include <boost/icl/split_interval_map.hpp>
+#include <boost/dynamic_bitset.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/date_time/posix_time/posix_time.hpp>
+#include <boost/date_time/gregorian/gregorian.hpp>
+#include <boost/progress.hpp>
+
+#include <htslib/sam.h>
+
+#include "version.h"
+#include "util.h"
+
+
+namespace torali
+{
+
+  struct ScanWindow {
+    bool select;
+    int32_t start;
+    int32_t end;
+    uint32_t cov;
+    uint32_t uniqcov;
+
+    ScanWindow() : select(false), start(0), end(0), cov(0), uniqcov(0) {}
+    explicit ScanWindow(int32_t const s) : select(false), start(s), end(s+1), cov(0), uniqcov(0) {}
+  };
+
+  template<typename TScanWindow>
+  struct SortScanWindow : public std::binary_function<TScanWindow, TScanWindow, bool>
+  {
+    inline bool operator()(TScanWindow const& sw1, TScanWindow const& sw2) {
+      return ((sw1.start<sw2.start) || ((sw1.start == sw2.start) && (sw1.end < sw2.end)));
+    }
+  };
+
+
+  template<typename TConfig>
+  inline int32_t
+  _findScanWindow(TConfig const& c, uint32_t const reflen, std::vector<uint16_t> const& binMap, int32_t const midPoint) {
+    if (c.hasScanFile) {
+      if (binMap[midPoint] == LAST_BIN) return -1;
+      else return binMap[midPoint];
+    } else {
+      uint32_t bin = midPoint / c.scanWindow;
+      uint32_t allbins = reflen / c.scanWindow;
+      if (bin >= allbins) return -1;
+      else return bin;
+    }
+    return -1;
+  }
+
+  
+  inline std::pair<uint32_t, uint32_t>
+  estCountBounds(std::vector< std::vector<ScanWindow> > const& scanCounts) {
+    std::vector<uint32_t> all;
+    for(uint32_t refIndex = 0; refIndex < scanCounts.size(); ++refIndex) {
+      for(uint32_t i = 0; i<scanCounts[refIndex].size(); ++i) {
+	if (scanCounts[refIndex][i].select) all.push_back(scanCounts[refIndex][i].cov);
+      }
+    }
+    std::sort(all.begin(), all.end());
+    uint32_t median = all[all.size() / 2];
+    std::vector<uint32_t> absdev;
+    for(uint32_t i = 0; i<all.size(); ++i) absdev.push_back(std::abs((int32_t) all[i] - (int32_t) median));
+    std::sort(absdev.begin(), absdev.end());
+    uint32_t mad = absdev[absdev.size() / 2];
+    uint32_t lowerBound = 0;
+    if (mad < median) lowerBound = median - mad;
+    uint32_t upperBound = median + mad;
+    return std::make_pair(lowerBound, upperBound);
+  }
+
+  template<typename TConfig>
+  inline void
+  scan(TConfig const& c, LibraryInfo const& li, std::vector< std::vector<ScanWindow> >& scanCounts) {
+
+    // Load bam file
+    samFile* samfile = sam_open(c.bamFile.string().c_str(), "r");
+    hts_set_fai_filename(samfile, c.genome.string().c_str());
+    hts_idx_t* idx = sam_index_load(samfile, c.bamFile.string().c_str());
+    bam_hdr_t* hdr = sam_hdr_read(samfile);
+
+    // Pre-defined scanning windows
+    if (c.hasScanFile) {
+      typedef boost::icl::interval_set<uint32_t> TChrIntervals;
+      typedef std::vector<TChrIntervals> TRegionsGenome;
+      TRegionsGenome scanRegions;
+      if (!_parseBedIntervals(c.scanFile.string(), c.hasScanFile, hdr, scanRegions)) {
+	std::cerr << "Warning: Couldn't parse BED intervals. Do the chromosome names match?" << std::endl;
+      }
+      for (int32_t refIndex = 0; refIndex < hdr->n_targets; ++refIndex) {
+	for(typename TChrIntervals::iterator it = scanRegions[refIndex].begin(); it != scanRegions[refIndex].end(); ++it) {
+	  if (it->lower() < it->upper()) {
+	    if ((it->lower() >= 0) && (it->upper() < hdr->target_len[refIndex])) {
+	      ScanWindow sw;
+	      sw.start = it->lower();
+	      sw.end = it->upper();
+	      sw.select = true;
+	      scanCounts[refIndex].push_back(sw);
+	    }
+	  }
+	}
+	// Sort scan windows
+	sort(scanCounts[refIndex].begin(), scanCounts[refIndex].end(), SortScanWindow<ScanWindow>());
+      }
+    }
+    
+    // Parse BAM file
+    boost::posix_time::ptime now = boost::posix_time::second_clock::local_time();
+    std::cout << '[' << boost::posix_time::to_simple_string(now) << "] " << "Scanning Windows" << std::endl;
+    boost::progress_display show_progress( hdr->n_targets );
+
+    // Iterate chromosomes
+    uint64_t totalCov = 0;
+    faidx_t* faiMap = fai_load(c.mapFile.string().c_str());
+    for(int32_t refIndex=0; refIndex < (int32_t) hdr->n_targets; ++refIndex) {
+      ++show_progress;
+      if (chrNoData(c, refIndex, idx)) continue;
+      // Exclude small chromosomes
+      if ((hdr->target_len[refIndex] < c.minChrLen) && (totalCov > 1000000)) continue;
+      // Exclude sex chromosomes
+      if ((std::string(hdr->target_name[refIndex]) == "chrX") || (std::string(hdr->target_name[refIndex]) == "chrY") || (std::string(hdr->target_name[refIndex]) == "X") || (std::string(hdr->target_name[refIndex]) == "Y")) continue;
+
+      // Check presence in mappability map
+      std::string tname(hdr->target_name[refIndex]);
+      int32_t seqlen = faidx_seq_len(faiMap, tname.c_str());
+      if (seqlen == -1) continue;
+      else seqlen = -1;
+      char* seq = faidx_fetch_seq(faiMap, tname.c_str(), 0, faidx_seq_len(faiMap, tname.c_str()), &seqlen);
+
+      // Get Mappability
+      std::vector<uint16_t> uniqContent(hdr->target_len[refIndex], 0);
+      {
+	// Mappability map
+	typedef boost::dynamic_bitset<> TBitSet;
+	TBitSet uniq(hdr->target_len[refIndex], false);
+	for(uint32_t i = 0; i < hdr->target_len[refIndex]; ++i) {
+	  if (seq[i] == 'C') uniq[i] = 1;
+	}
+
+	// Sum across fragments
+	int32_t halfwin = (int32_t) (c.meanisize / 2);
+	int32_t usum = 0;
+	for(int32_t pos = halfwin; pos < (int32_t) hdr->target_len[refIndex] - halfwin; ++pos) {
+	  if (pos == halfwin) {
+	    for(int32_t i = pos - halfwin; i<=pos+halfwin; ++i) usum += uniq[i];
+	  } else {
+	    usum -= uniq[pos - halfwin - 1];
+	    usum += uniq[pos + halfwin];
+	  }
+	  uniqContent[pos] = usum;
+	}
+      }
+
+      // Bins on this chromosome
+      std::vector<uint16_t> binMap;
+      if (!c.hasScanFile) {
+	uint32_t allbins = hdr->target_len[refIndex] / c.scanWindow;
+	scanCounts[refIndex].resize(allbins, ScanWindow());
+	for(uint32_t i = 0; i < allbins; ++i) {
+	  scanCounts[refIndex][i].start = i * c.scanWindow;
+	  scanCounts[refIndex][i].end = (i+1) * c.scanWindow;
+	}
+      } else {
+	// Fill bin map
+	binMap.resize(hdr->target_len[refIndex], LAST_BIN);
+	if (scanCounts[refIndex].size() >= LAST_BIN) {
+	  std::cerr << "Warning: Too many scan windows on " << hdr->target_name[refIndex] << std::endl;
+	}
+	for(uint32_t bin = 0;((bin < scanCounts[refIndex].size()) && (bin < LAST_BIN)); ++bin) {
+	  for(int32_t k = scanCounts[refIndex][bin].start; k < scanCounts[refIndex][bin].end; ++k) binMap[k] = bin;
+	}
+      }
+	
+      // Mate map
+      typedef boost::unordered_map<std::size_t, bool> TMateMap;
+      TMateMap mateMap;
+
+      // Count reads
+      hts_itr_t* iter = sam_itr_queryi(idx, refIndex, 0, hdr->target_len[refIndex]);
+      bam1_t* rec = bam_init1();
+      int32_t lastAlignedPos = 0;
+      std::set<std::size_t> lastAlignedPosReads;
+      while (sam_itr_next(samfile, iter, rec) >= 0) {
+	if (rec->core.flag & (BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP | BAM_FSUPPLEMENTARY | BAM_FUNMAP)) continue;
+	if ((rec->core.flag & BAM_FPAIRED) && ((rec->core.flag & BAM_FMUNMAP) || (rec->core.tid != rec->core.mtid))) continue;
+	if (rec->core.qual < c.minQual) continue;
+	if (getSVType(rec->core) != 2) continue;
+
+	int32_t midPoint = rec->core.pos + halfAlignmentLength(rec);
+	if (rec->core.flag & BAM_FPAIRED) {
+	  // Clean-up the read store for identical alignment positions
+	  if (rec->core.pos > lastAlignedPos) {
+	    lastAlignedPosReads.clear();
+	    lastAlignedPos = rec->core.pos;
+	  }
+	
+	  if ((rec->core.pos < rec->core.mpos) || ((rec->core.pos == rec->core.mpos) && (lastAlignedPosReads.find(hash_string(bam_get_qname(rec))) == lastAlignedPosReads.end()))) {
+	    // First read
+	    lastAlignedPosReads.insert(hash_string(bam_get_qname(rec)));
+	    std::size_t hv = hash_pair(rec);
+	    mateMap[hv] = true;
+	    continue;
+	  } else {
+	    // Second read
+	    std::size_t hv = hash_pair_mate(rec);
+	    if ((mateMap.find(hv) == mateMap.end()) || (!mateMap[hv])) continue; // Mate discarded
+	    mateMap[hv] = false;
+	  }
+
+	  // Insert size filter
+	  int32_t isize = (rec->core.pos + alignmentLength(rec)) - rec->core.mpos;
+	  if ((li.minNormalISize < isize) && (isize < li.maxNormalISize)) midPoint = rec->core.mpos + (int32_t) (isize/2);
+	  else continue;
+	}
+
+	// Count fragment
+	if ((midPoint >= 0) && (midPoint < (int32_t) hdr->target_len[refIndex])) {
+	  int32_t bin = _findScanWindow(c, hdr->target_len[refIndex], binMap, midPoint);
+	  if (bin >= 0) {
+	    ++scanCounts[refIndex][bin].cov;
+	    
+	    if (uniqContent[midPoint] >= c.fragmentUnique * c.meanisize) ++scanCounts[refIndex][bin].uniqcov;
+	    ++totalCov;
+	  }
+	}
+      }
+      // Clean-up
+      bam_destroy1(rec);
+      hts_itr_destroy(iter);
+      if (seq != NULL) free(seq);
+    }
+    
+    // clean-up
+    fai_destroy(faiMap);
+    bam_hdr_destroy(hdr);
+    hts_idx_destroy(idx);
+    sam_close(samfile);
+  }
+
+
+  template<typename TConfig>
+  inline void
+  selectWindows(TConfig const& c, std::vector< std::vector<ScanWindow> >& scanCounts) {
+    if (c.noScanWindowSelection) {
+      // Select all windows
+      for(uint32_t refIndex = 0; refIndex < scanCounts.size(); ++refIndex) {
+	for(uint32_t i = 0; i<scanCounts[refIndex].size(); ++i) {
+	  scanCounts[refIndex][i].select = true;
+	}
+      }
+    } else {
+      // Pre-screen using PE layout, uniqueness and percent identity
+      for(uint32_t refIndex = 0; refIndex < scanCounts.size(); ++refIndex) {
+	for(uint32_t i = 0; i<scanCounts[refIndex].size(); ++i) {
+	  // Uniqueness
+	  double uniqratio = 0;
+	  if (scanCounts[refIndex][i].cov > 0) uniqratio = (double) scanCounts[refIndex][i].uniqcov / scanCounts[refIndex][i].cov;
+	  if (uniqratio > c.uniqueToTotalCovRatio) scanCounts[refIndex][i].select = true;
+	  else scanCounts[refIndex][i].select = false;
+	}
+      }
+      
+      // Normalize user-defined scan windows to same length (10,000bp)
+      if (c.hasScanFile) {
+	for(uint32_t refIndex = 0; refIndex < scanCounts.size(); ++refIndex) {
+	  for(uint32_t i = 0; i<scanCounts[refIndex].size(); ++i) {
+	    double scale = (double) 10000 / (double) (scanCounts[refIndex][i].end - scanCounts[refIndex][i].start);
+	    scanCounts[refIndex][i].uniqcov *= scale;
+	    scanCounts[refIndex][i].cov *= scale;
+	  }
+	}
+      }
+      
+      // Get "normal" coverage windows of CN2
+      typedef std::pair<uint32_t, uint32_t> TCountBounds;
+      TCountBounds cb = estCountBounds(scanCounts);
+      
+      // Select CN2 windows
+      for(uint32_t refIndex = 0; refIndex < scanCounts.size(); ++refIndex) {
+	for(uint32_t i = 0; i<scanCounts[refIndex].size(); ++i) {
+	  if (scanCounts[refIndex][i].select) {
+	    if ((scanCounts[refIndex][i].cov > cb.first) && (scanCounts[refIndex][i].cov < cb.second)) scanCounts[refIndex][i].select = true;
+	    else scanCounts[refIndex][i].select = false;
+	  }
+	}
+      }
+    }
+  }
+  
+}
+
+#endif


=====================================
src/tags.h
=====================================
@@ -11,6 +11,10 @@ namespace torali {
   #define DELLY_CHOP_REFSIZE 1000
   #endif
 
+  #ifndef DELLY_OUTOFBAND
+  #define DELLY_OUTOFBAND -99999999
+  #endif
+
   inline bool
   _translocation(int32_t const svt) {
     return (DELLY_SVT_TRANS <= svt);
@@ -145,8 +149,8 @@ namespace torali {
   inline bool
   _svSizeCheck(TSize const s, TSize const e, int32_t const svt) {
     // Short reads
-    if (svt == 0) return (( e - s ) >= 100);
-    else if (svt == 1) return (( e - s ) >= 100);
+    if (svt == 0) return (( e - s ) >= 300);
+    else if (svt == 1) return (( e - s ) >= 300);
     else if (svt == 2) return (( e - s ) >= 300);
     else if (svt == 3) return (( e - s ) >= 100);
     else return true;


=====================================
src/tegua.h
=====================================
@@ -45,6 +45,7 @@ namespace torali {
     uint32_t minRefSep;
     uint32_t maxReadSep;
     uint32_t graphPruning;
+    uint32_t minCliqueSize;
     int32_t nchr;
     int32_t minimumFlankSize;
     float indelExtension;
@@ -118,7 +119,7 @@ namespace torali {
 
      // SV Discovery
      _clusterSRReads(c, validRegions, svc, tmpStore);
-
+     
      // Assemble
      assemble(c, validRegions, svc, tmpStore);
 
@@ -246,6 +247,7 @@ namespace torali {
    disc.add_options()
      ("mapqual,q", boost::program_options::value<uint16_t>(&c.minMapQual)->default_value(10), "min. mapping quality")
      ("minclip,c", boost::program_options::value<uint32_t>(&c.minClip)->default_value(25), "min. clipping length")
+     ("min-clique-size,z", boost::program_options::value<uint32_t>(&c.minCliqueSize)->default_value(2), "min. clique size")     
      ("minrefsep,m", boost::program_options::value<uint32_t>(&c.minRefSep)->default_value(30), "min. reference separation")
      ("maxreadsep,n", boost::program_options::value<uint32_t>(&c.maxReadSep)->default_value(75), "max. read separation")
      ;
@@ -296,6 +298,9 @@ namespace torali {
    if (vm.count("dump")) c.hasDumpFile = true;
    else c.hasDumpFile = false;
 
+   // Clique size
+   if (c.minCliqueSize < 2) c.minCliqueSize = 2;
+
    // Check reference
    if (!(boost::filesystem::exists(c.genome) && boost::filesystem::is_regular_file(c.genome) && boost::filesystem::file_size(c.genome))) {
      std::cerr << "Reference file is missing: " << c.genome.string() << std::endl;


=====================================
src/util.h
=====================================
@@ -16,6 +16,10 @@
 namespace torali
 {
 
+  #ifndef LAST_BIN
+  #define LAST_BIN 65535
+  #endif
+  
   struct LibraryInfo {
     int32_t rs;
     int32_t median;
@@ -41,7 +45,13 @@ namespace torali
   };
 
 
-
+  inline bool
+  nContent(std::string const& s) {
+    for(uint32_t i = 0; i < s.size(); ++i) {
+      if ((s[i] == 'N') || (s[i] == 'n')) return true;
+    }
+    return false;
+  }
   
   // Decode Orientation
   inline int32_t
@@ -199,7 +209,7 @@ namespace torali
     return sequenceLength(rec);
   }
     
-  inline uint32_t alignmentLength(bam1_t* rec) {
+  inline uint32_t alignmentLength(bam1_t const* rec) {
     uint32_t* cigar = bam_get_cigar(rec);
     uint32_t alen = 0;
     for (uint32_t i = 0; i < rec->core.n_cigar; ++i)
@@ -207,10 +217,17 @@ namespace torali
     return alen;
   }
 
-  inline uint32_t halfAlignmentLength(bam1_t* rec) {
+  inline uint32_t halfAlignmentLength(bam1_t const* rec) {
     return (alignmentLength(rec) / 2);
   }
 
+  inline uint32_t
+  lastAlignedPosition(bam1_t const* rec) {
+    return rec->core.pos + alignmentLength(rec);
+  }
+
+
+
   inline std::size_t hash_lr(bam1_t* rec) {
     boost::hash<std::string> string_hash;
     std::string qname = bam_get_qname(rec);
@@ -293,6 +310,47 @@ namespace torali
   }
 
 
+  inline uint32_t
+  setMinChrLen(bam_hdr_t const* hdr, double const xx) {
+    uint32_t minChrLen = 0;
+    std::vector<uint32_t> chrlen(hdr->n_targets, 0);
+    uint64_t genomelen = 0;
+    for(int32_t refIndex = 0; refIndex < hdr->n_targets; ++refIndex) {
+      chrlen[refIndex] = hdr->target_len[refIndex];
+      genomelen += hdr->target_len[refIndex];
+    }
+    std::sort(chrlen.begin(), chrlen.end(), std::greater<uint32_t>());
+    uint64_t cumsum = 0;
+    for(uint32_t i = 0; i < chrlen.size(); ++i) {
+      cumsum += chrlen[i];
+      minChrLen = chrlen[i];
+      if (cumsum > genomelen * xx) break;
+    }
+    return minChrLen;
+  }
+  
+  
+  template<typename TConfig>
+  inline bool
+  chrNoData(TConfig const& c, uint32_t const refIndex, hts_idx_t const* idx) {
+    // Check we have mapped reads on this chromosome
+    std::string suffix("cram");
+    std::string str(c.bamFile.string());
+    if ((str.size() >= suffix.size()) && (str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0)) return false;
+    uint64_t mapped = 0;
+    uint64_t unmapped = 0;
+    hts_idx_get_stat(idx, refIndex, &mapped, &unmapped);
+    if (mapped) return false;
+    else return true;
+  }    
+  
+  inline std::size_t hash_se(bam1_t* rec) {
+    std::size_t seed = hash_string(bam_get_qname(rec));
+    boost::hash_combine(seed, rec->core.tid);
+    boost::hash_combine(seed, rec->core.pos);
+    return seed;
+  }
+  
   inline void
   getSMTag(std::string const& header, std::string const& fileName, std::string& sampleName) {
     std::set<std::string> smIdentifiers;
@@ -564,8 +622,8 @@ namespace torali
 	  } else {
 	    sampleLib[file_c].median = median;
 	    sampleLib[file_c].mad = mad;
-	    sampleLib[file_c].maxNormalISize = median + (5 * mad);
-	    sampleLib[file_c].minNormalISize = median - (5 * mad);
+	    sampleLib[file_c].maxNormalISize = median + (c.madNormalCutoff * mad);
+	    sampleLib[file_c].minNormalISize = median - (c.madNormalCutoff * mad);
 	    if (sampleLib[file_c].minNormalISize < 0) sampleLib[file_c].minNormalISize=0;
 	    sampleLib[file_c].maxISizeCutoff = median + (c.madCutoff * mad);
 	    sampleLib[file_c].minISizeCutoff = median - (c.madCutoff * mad);


=====================================
src/version.h
=====================================
@@ -5,7 +5,7 @@ namespace torali
 {
 
 
-  std::string dellyVersionNumber = "0.8.3";
+  std::string dellyVersionNumber = "0.8.5";
 
   inline 
     void printTitle(std::string const& title) 



View it on GitLab: https://salsa.debian.org/med-team/delly/-/commit/77fba0a1f8ba40e9e450e816f55ca45ca3013089

-- 
View it on GitLab: https://salsa.debian.org/med-team/delly/-/commit/77fba0a1f8ba40e9e450e816f55ca45ca3013089
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20201002/c682c418/attachment-0001.html>


More information about the debian-med-commit mailing list