[med-svn] [Git][med-team/tvc][master] 12 commits: Add watch file using Git mode, Remove code copy of libsmithwaterman
Andreas Tille
gitlab at salsa.debian.org
Thu Sep 27 09:15:22 BST 2018
Andreas Tille pushed to branch master at Debian Med / tvc
Commits:
ca31db55 by Andreas Tille at 2018-09-27T07:37:23Z
Add watch file using Git mode, Remove code copy of libsmithwaterman
- - - - -
70e795ac by Andreas Tille at 2018-09-27T07:38:38Z
Add full file exclusion list used in former packaging
- - - - -
21d5d20b by Andreas Tille at 2018-09-27T07:38:38Z
New upstream version 5.0.3+git20151221.80e144e+dfsg
- - - - -
6333ae25 by Andreas Tille at 2018-09-27T07:54:04Z
Use Debian packaged libsmithwaterman
- - - - -
1fba086f by Andreas Tille at 2018-09-27T07:57:59Z
hardening=+all
- - - - -
efe8db80 by Andreas Tille at 2018-09-27T07:59:41Z
Update copyright
- - - - -
68ad33f0 by Andreas Tille at 2018-09-27T08:00:24Z
debhelper 11
- - - - -
4dd922af by Andreas Tille at 2018-09-27T08:00:27Z
Point Vcs fields to salsa.debian.org
- - - - -
d2200ca9 by Andreas Tille at 2018-09-27T08:00:27Z
Standards-Version: 4.2.1
- - - - -
e11696ca by Andreas Tille at 2018-09-27T08:06:37Z
Fix duplicated license
- - - - -
931f804b by Andreas Tille at 2018-09-27T08:08:58Z
Ignore script-with-language-extension as per consensus on Debian Med list
- - - - -
f5165aac by Andreas Tille at 2018-09-27T08:12:15Z
Upload to unstable
- - - - -
26 changed files:
- + README.md
- + README.txt
- debian/changelog
- debian/compat
- debian/control
- debian/copyright
- + debian/lintian-overrides
- debian/patches/series
- + debian/patches/use_debian_packaged_smithwaterman.patch
- debian/rules
- + debian/watch
- − external/vcflib/smithwaterman/BandedSmithWaterman.cpp
- − external/vcflib/smithwaterman/BandedSmithWaterman.h
- − external/vcflib/smithwaterman/IndelAllele.cpp
- − external/vcflib/smithwaterman/IndelAllele.h
- − external/vcflib/smithwaterman/LeftAlign.cpp
- − external/vcflib/smithwaterman/LeftAlign.h
- − external/vcflib/smithwaterman/Makefile
- − external/vcflib/smithwaterman/Mosaik.h
- − external/vcflib/smithwaterman/Repeats.cpp
- − external/vcflib/smithwaterman/Repeats.h
- − external/vcflib/smithwaterman/SWMain.cpp
- − external/vcflib/smithwaterman/SmithWatermanGotoh.cpp
- − external/vcflib/smithwaterman/SmithWatermanGotoh.h
- − external/vcflib/smithwaterman/convert.h
- − external/vcflib/smithwaterman/smithwaterman.cpp
Changes:
=====================================
README.md
=====================================
@@ -0,0 +1,165 @@
+# Ion Torrent Variant Caller
+
+Ion Torrent Variant Caller (TVC) is a genetic variant caller for Ion Torrent sequencing platforms,
+and is specially optimized to exploit the underlying flow signal information in the statistical model
+to evaluate variants. Torrent Variant Caller is designed to call single-nucleotide polymorphisms (SNPs),
+multi-nucleotide polymorphisms (MNPs), insertions, deletions, and block substitutions.
+
+### Quick Start
+```
+git clone https://github.com/domibel/IonTorrent-VariantCaller.git
+
+BASE=`pwd`
+TVC_SOURCE_DIR=$BASE/IonTorrent-VariantCaller
+
+*** adjust build and install directory ***
+TVC_BUILD_DIR=$BASE/IonTorrent-VariantCaller-build
+TVC_INSTALL_DIR=$BASE/IonTorrent-VariantCaller-install/usr/local
+
+mkdir -p $TVC_BUILD_DIR
+mkdir -p $TVC_INSTALL_DIR
+
+cd $TVC_BUILD_DIR
+wget --no-clobber updates.iontorrent.com/updates/software/external/armadillo-4.600.1.tar.gz
+tar xvzf armadillo-4.600.1.tar.gz
+cd armadillo-4.600.1/
+sed -i 's:^// #define ARMA_USE_LAPACK$:#define ARMA_USE_LAPACK:g' include/armadillo_bits/config.hpp
+sed -i 's:^// #define ARMA_USE_BLAS$:#define ARMA_USE_BLAS:g' include/armadillo_bits/config.hpp
+cmake .
+make -j4
+
+cd $TVC_BUILD_DIR
+wget --no-clobber updates.iontorrent.com/updates/software/external/bamtools-2.4.0.20150702+git15eadb925f.tar.gz
+tar xvzf bamtools-2.4.0.20150702+git15eadb925f.tar.gz
+mkdir bamtools-2.4.0.20150702+git15eadb925f-build
+cd bamtools-2.4.0.20150702+git15eadb925f-build
+cmake ../bamtools-2.4.0.20150702+git15eadb925f -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo
+make -j4
+
+cd $TVC_BUILD_DIR
+wget --no-clobber --no-check-certificate https://github.com/samtools/htslib/archive/1.2.1.tar.gz -O htslib-1.2.1.tar.gz
+tar xvzf htslib-1.2.1.tar.gz
+ln -s htslib-1.2.1 htslib # for samtools
+cd htslib-1.2.1
+make -j4
+
+cd $TVC_BUILD_DIR
+wget --no-clobber --no-check-certificate https://github.com/samtools/samtools/archive/1.2.tar.gz -O samtools-1.2.tar.gz
+tar xvzf samtools-1.2.tar.gz
+cd samtools-1.2
+make -j4
+mkdir $TVC_INSTALL_DIR/bin
+cp samtools $TVC_INSTALL_DIR/bin/
+
+cd $TVC_BUILD_DIR
+mkdir TVC-build
+cd TVC-build
+cmake $TVC_SOURCE_DIR -DCMAKE_INSTALL_PREFIX:PATH=$TVC_INSTALL_DIR -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo
+make -j4 install
+
+*** quick test ***
+TVC_ROOT_DIR=$TVC_INSTALL_DIR
+$TVC_ROOT_DIR/bin/variant_caller_pipeline.py \
+ --input-bam $TVC_ROOT_DIR/share/TVC/examples/example1/test.bam \
+ --reference-fasta $TVC_ROOT_DIR/share/TVC/examples/example1/reference.fasta \
+ --region-bed $TVC_ROOT_DIR/share/TVC/examples/example1/test_merged_plain.bed \
+ --primer-trim-bed $TVC_ROOT_DIR/share/TVC/examples/example1/test_unmerged_detail.bed
+
+*** export PATH, following tools are required: samtools zip tvc ***
+export PATH=$PATH:$TVC_ROOT_DIR/bin
+```
+
+
+### Get reference genome hg19 from Ion Torrent, Size: 868170684 (828M)
+
+##### Download zip archive
+```
+$ wget http://ionupdates.com/reference/hg19.zip
+```
+
+##### Unpack zip archive
+```
+$ unzip hg19.zip
+```
+
+##### Create hg19.fasta.fai
+```
+$ samtools faidx hg19.fasta
+```
+
+##### Content
+```
+$ md5sum hg19.zip hg19.fasta hg19.fasta.fai
+21afdf14b6b3734e0d25157e60bdfb24 hg19.zip
+d6851f9f4537ff4e9beb5b7a08b89230 hg19.fasta
+cdcd61262b6f4af8e8ac6c2be88a161a hg19.fasta.fai
+
+$ ls -l
+-rw-r--r-- 1 ionadmin ionadmin 3147289017 Mar 22 2011 hg19.fasta
+-rw-rw-r-- 1 ionadmin ionadmin 788 Nov 18 16:18 hg19.fasta.fai
+-rw-rw-r-- 1 ionadmin ionadmin 868170684 Jul 8 2014 hg19.zip
+```
+
+### Examples
+
+#### 316 - H. sapiens - AmpliSeq BRCA1/BRCA2 Community Panel
+##### https://ioncommunity.thermofisher.com/docs/DOC-7515
+##### CN:TorrentServer/BRCA PL:IONTORRENT PU:PGM/316D/IonXpress_009
+```
+wget http://ion-torrent.s3.amazonaws.com/pgm/BRCArun94/BRCArun94IonXpress_009_rawlib.bam # 163535738 156M
+wget http://ion-torrent.s3.amazonaws.com/pgm/BRCArun94/BRCArun94IonXpress_009_rawlib.bam.bai # 1675968 1.6M
+wget http://ion-torrent.s3.amazonaws.com/pgm/BRCArun94/BRCArun94TSVC_variants_IonXpress_009.vcf # 85850 84K
+
+variant_caller_pipeline.py \
+ --input-bam BRCArun94IonXpress_009_rawlib.bam \
+ --reference-fasta hg19.fasta \
+ --parameters-file=/usr/share/TVC/pluginMedia/parameter_sets/ampliseq_somatic_lowstringency_pgm_parameters.json \
+ --generate-gvcf=on \
+ --num-threads=4
+```
+
+
+#### Alternative: Get reference genome hg19 with TMAP index files from Ion Torrent, Size: 4613331358 (4.3G)
+
+```
+$ wget http://ionupdates.com/reference_downloads/hg19.zip
+$ unzip hg19.zip
+```
+
+##### Content
+```
+$ md5sum *
+d2dacce998d72687ca1f2dd28ce2280e CreateSequenceDictionary.log
+d264f416c20f7d144d74924ab7c356a0 hg19.dict
+d6851f9f4537ff4e9beb5b7a08b89230 hg19.fasta
+cdcd61262b6f4af8e8ac6c2be88a161a hg19.fasta.fai
+6903f05eef92e5e6f5a6bf890a01d6e6 hg19.fasta.tmap.anno
+f9d7c7dedf322022a0393d99a629229b hg19.fasta.tmap.bwt
+19c79b65021812df67904f7448367d5f hg19.fasta.tmap.pac
+ebd201bec6945900ce9f26b62cf89915 hg19.fasta.tmap.sa
+09b3113801b9c6ae69134f8712ef6a8c hg19.info.txt
+3bc8b68ce604c77f2fecd14ab20a13bc hg19.md5sum.txt
+3b725f35b395cf763bb7ac2110ab8d1a hg19.zip
+d41d8cd98f00b204e9800998ecf8427e samtools.log
+b9ba9df7fc3cd18883e7922bc6dd20d6 tmap.log
+```
+```
+$ ls -l
+total 13218824
+-rwxr-xr-x 1 ionadmin ionadmin 624 May 6 2013 CreateSequenceDictionary.log
+-rwxr-xr-x 1 ionadmin ionadmin 3174 May 6 2013 hg19.dict
+-rwxr-xr-x 1 ionadmin ionadmin 3147289017 May 6 2013 hg19.fasta
+-rwxr-xr-x 1 ionadmin ionadmin 788 May 6 2013 hg19.fasta.fai
+-rwxr-xr-x 1 ionadmin ionadmin 3985 May 6 2013 hg19.fasta.tmap.anno
+-rwxr-xr-x 1 ionadmin ionadmin 3453608024 May 6 2013 hg19.fasta.tmap.bwt
+-rwxr-xr-x 1 ionadmin ionadmin 773923497 May 6 2013 hg19.fasta.tmap.pac
+-rwxr-xr-x 1 ionadmin ionadmin 1547847008 May 6 2013 hg19.fasta.tmap.sa
+-rwxr-xr-x 1 ionadmin ionadmin 92 May 6 2013 hg19.info.txt
+-rwxr-xr-x 1 ionadmin ionadmin 642 May 6 2013 hg19.md5sum.txt
+-rw-rw-r-- 1 ionadmin ionadmin 4613331358 Jul 8 2014 hg19.zip
+-rwxr-xr-x 1 ionadmin ionadmin 0 May 6 2013 samtools.log
+-rwxr-xr-x 1 ionadmin ionadmin 15170 May 6 2013 tmap.log
+```
+
+
+See also https://ioncommunity.thermofisher.com/community/products/software/torrent-variant-caller
=====================================
README.txt
=====================================
@@ -0,0 +1,198 @@
+Compilation instructions for TVC
+
+# Steps 1 through 11 explain how to compile TVC.
+# Step 12 onward explain how to deploy and run the compiled version on a single computer or cluster.
+
+For more information see:
+http://ioncommunity.lifetechnologies.com/community/products/torrent-variant-caller
+
+# 1. Download source file
+
+wget updates.iontorrent.com/tvc_standalone/tvc-5.0.3.tar.gz
+
+
+# 2. Copy source file into build root directory
+
+TVC_VERSION=tvc-5.0.3
+
+BUILD_ROOT_DIR=`mktemp -d`
+cp $TVC_VERSION.tar.gz $BUILD_ROOT_DIR
+DISTRIBUTION_CODENAME=`lsb_release -is`_`lsb_release -rs`_`uname -m`
+TVC_INSTALL_DIR=$BUILD_ROOT_DIR/$TVC_VERSION-$DISTRIBUTION_CODENAME-binary
+mkdir -p $TVC_INSTALL_DIR/bin/
+
+# 3. Install dependencies
+
+# 3.1 RedHat/CentOS
+
+yum -y install gcc-c++ cmake zlib-devel bzip2-devel bzip2 \
+ncurses-devel python-simplejson atlas-devel blas-devel lapack-devel redhat-lsb-core boost-devel \
+# perl perl-Data-Dumper # with version 7
+
+
+# 3.2 Debian/Ubuntu
+
+sudo aptitude install g++ cmake zlib1g-dev libbz2-dev libncurses-dev libboost-math-dev \
+libatlas-dev liblapack-dev
+
+
+# 3.3 cmake
+
+# Required is cmake (>=2.8.0), you can check the installed version with e.g.:
+
+# $ cmake -version
+# cmake version 2.8.0
+
+# Installation of a newer cmake version is only required if the installed version is older than 2.8.0
+
+# To delete the old cmake package:
+
+# Redhat/CentOS:
+yum -y erase cmake
+
+# Debian/Ubuntu:
+aptitude purge cmake
+
+cd ~
+wget http://www.cmake.org/files/v2.8/cmake-2.8.11.2.tar.gz
+tar xvzf cmake-2.8.11.2.tar.gz
+cd cmake-2.8.11.2
+./configure
+make -j5
+make install
+
+
+# 4. build armadillo
+cd $BUILD_ROOT_DIR
+wget http://updates.iontorrent.com/updates/software/external/armadillo-4.600.1.tar.gz
+tar xvzf armadillo-4.600.1.tar.gz
+cd armadillo-4.600.1/
+sed -i 's:^// #define ARMA_USE_LAPACK$:#define ARMA_USE_LAPACK:g' include/armadillo_bits/config.hpp
+sed -i 's:^// #define ARMA_USE_BLAS$:#define ARMA_USE_BLAS:g' include/armadillo_bits/config.hpp
+cmake .
+make -j4
+
+
+# 5. build bamtools
+cd $BUILD_ROOT_DIR
+wget updates.iontorrent.com/updates/software/external/bamtools-2.4.0.20150702+git15eadb925f.tar.gz
+tar xvzf bamtools-2.4.0.20150702+git15eadb925f.tar.gz
+mkdir bamtools-2.4.0.20150702+git15eadb925f-build
+cd bamtools-2.4.0.20150702+git15eadb925f-build
+cmake ../bamtools-2.4.0.20150702+git15eadb925f -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo
+make -j4
+
+
+# 7. build htslib
+cd $BUILD_ROOT_DIR
+wget --no-check-certificate https://github.com/samtools/htslib/archive/1.2.1.tar.gz -O htslib-1.2.1.tar.gz
+tar xvzf htslib-1.2.1.tar.gz
+ln -s htslib-1.2.1 htslib # for samtools
+cd htslib-1.2.1
+make -j4
+
+
+# 8. build samtools
+cd $BUILD_ROOT_DIR
+wget --no-check-certificate https://github.com/samtools/samtools/archive/1.2.tar.gz -O samtools-1.2.tar.gz
+tar xvzf samtools-1.2.tar.gz
+cd samtools-1.2
+make -j4
+cp samtools $TVC_INSTALL_DIR/bin/
+
+
+# 10. build TVC
+cd $BUILD_ROOT_DIR
+tar xvzf $TVC_VERSION.tar.gz
+TVC_SOURCE_DIR=$BUILD_ROOT_DIR/$TVC_VERSION
+mkdir $TVC_VERSION-build
+cd $TVC_VERSION-build
+cmake $TVC_SOURCE_DIR -DCMAKE_INSTALL_PREFIX:PATH=$TVC_INSTALL_DIR -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo
+make -j4 install
+
+# 11. create binary package
+tar cvzf $TVC_VERSION-$DISTRIBUTION_CODENAME-binary.tar.gz -C $BUILD_ROOT_DIR $TVC_VERSION-$DISTRIBUTION_CODENAME-binary
+
+
+######################################################################################
+
+# 12.1 Either use the TVC version from the (temporary) TVC_INSTALL_DIR directory
+
+TVC_ROOT_DIR=$TVC_INSTALL_DIR
+
+
+# 12.2 or use the TVC binary package
+
+tar xvzf $TVC_VERSION-$DISTRIBUTION_CODENAME-binary.tar.gz
+TVC_ROOT_DIR=`pwd`/$TVC_VERSION-$DISTRIBUTION_CODENAME-binary
+
+
+# 13. export PATH, following tools are required: samtools zip tvc
+export PATH=$PATH:$TVC_ROOT_DIR/bin
+
+
+# 14. adjust some file paths and invoke TVC
+
+# Required are 1 reference, 2 bed files, 1 aligned bam file, and 1 tvc parameter file
+
+# Example 1:
+
+$TVC_ROOT_DIR/bin/variant_caller_pipeline.py \
+ --input-bam $TVC_ROOT_DIR/share/TVC/examples/example1/test.bam \
+ --reference-fasta $TVC_ROOT_DIR/share/TVC/examples/example1/reference.fasta \
+ --region-bed $TVC_ROOT_DIR/share/TVC/examples/example1/test_merged_plain.bed \
+ --primer-trim-bed $TVC_ROOT_DIR/share/TVC/examples/example1/test_unmerged_detail.bed
+
+# Example 1 with specified parameter file and output directory:
+
+$TVC_ROOT_DIR/bin/variant_caller_pipeline.py \
+ --input-bam $TVC_ROOT_DIR/share/TVC/examples/example1/test.bam \
+ --reference-fasta $TVC_ROOT_DIR/share/TVC/examples/example1/reference.fasta \
+ --region-bed $TVC_ROOT_DIR/share/TVC/examples/example1/test_merged_plain.bed \
+ --primer-trim-bed $TVC_ROOT_DIR/share/TVC/examples/example1/test_unmerged_detail.bed \
+ --parameters-file $TVC_ROOT_DIR/share/TVC/pluginMedia/parameter_sets/ampliseq_somatic_lowstringency_pgm_parameters.json \
+ --output-dir /tmp/tvc_example1
+
+
+# Example 2 (not included yet) :
+
+# Required are 1 reference, 2 bed files, 1 aligned bam file, and 1 tvc parameter file
+ REF=/mnt/TS/source/hg19/hg19.fasta
+BED_UNMERGED_DETAIL=/mnt/TS/source/tvc_test_GB1-118-CSI/unmerged/detail/Exome_draft_Designed_20130531.bed
+ BED_MERGED_PLAIN=/mnt/TS/source/tvc_test_GB1-118-CSI/merged/plain/Exome_draft_Designed_20130531.bed
+ BAM=/mnt/TS/source/tvc_test_GB1-118-CSI/IonXpress_045_rawlib.bam
+ TVC_PARAM=$TVC_ROOT_DIR/share/TVC/pluginMedia/configs/germline_low_stringency_proton.json
+
+TMP_DIR=`mktemp -d`
+
+$TVC_ROOT_DIR/bin/variant_caller_pipeline.py \
+ --parameters-file $TVC_PARAM \
+ --input-bam $BAM \
+ --reference-fasta $REF \
+ --region-bed $BED_MERGED_PLAIN \
+ --primer-trim-bed $BED_UNMERGED_DETAIL \
+ --postprocessed-bam $TMP_DIR/trimmed.bam \
+ --output-dir $TMP_DIR
+
+
+# Example 3 (hotspot) (not included yet):
+
+ REF=/mnt/TS/source/hg19/hg19.fasta
+BED_UNMERGED_DETAIL=/mnt/TS/source/tvc_test_Z06-506-CCP/unmerged_detail_CCP.20131001.designed.bed
+ BED_MERGED_PLAIN=/mnt/TS/source/tvc_test_Z06-506-CCP/merged_plain_CCP.20131001.designed.bed
+ BAM=/mnt/TS/source/tvc_test_Z06-506-CCP/IonXpress_019_rawlib.bam
+ TVC_PARAM=/mnt/TS/source/tvc_test_Z06-506-CCP/local_parameters.json
+ HOTSPOT=/mnt/TS/source/tvc_test_Z06-506-CCP/hotspot.vcf
+
+TMP_DIR=`mktemp -d`
+
+time $TVC_ROOT_DIR/bin/variant_caller_pipeline.py \
+ --parameters-file $TVC_PARAM \
+ --input-bam $BAM \
+ --reference-fasta $REF \
+ --region-bed $BED_MERGED_PLAIN \
+ --primer-trim-bed $BED_UNMERGED_DETAIL \
+ --hotspot-vcf $HOTSPOT \
+ --postprocessed-bam $TMP_DIR/trimmed.bam \
+ --output-dir $TMP_DIR
+
=====================================
debian/changelog
=====================================
@@ -1,9 +1,22 @@
-tvc (5.0.3+dfsg-3) UNRELEASED; urgency=medium
+tvc (5.0.3+git20151221.80e144e+dfsg-1) unstable; urgency=medium
+ [ Steffen Moeller ]
* Added notion of lacking entries in research software
registries in debian/upstream/metadata
- -- Steffen Moeller <moeller at debian.org> Thu, 14 Sep 2017 09:39:02 +0200
+ [ Andreas Tille ]
+ * Add watch file using Git mode
+ * Remove code copy of libsmithwaterman, mention all other files that
+ were previously removed in d/copyright
+ * Use Debian packaged libsmithwaterman
+ * hardening=+all
+ * debhelper 11
+ * Point Vcs fields to salsa.debian.org
+ * Standards-Version: 4.2.1
+ * Ignore script-with-language-extension as per consensus on Debian Med
+ list
+
+ -- Andreas Tille <tille at debian.org> Thu, 27 Sep 2018 10:09:12 +0200
tvc (5.0.3+dfsg-2) experimental; urgency=medium
=====================================
debian/compat
=====================================
@@ -1 +1 @@
-10
+11
=====================================
debian/control
=====================================
@@ -4,15 +4,16 @@ Uploaders: Dominique Belhachemi <domibel at debian.org>,
Andreas Tille <tille at debian.org>
Section: science
Priority: optional
-Build-Depends: debhelper (>= 10),
+Build-Depends: debhelper (>= 11~),
cmake,
libbamtools-dev,
libjsoncpp-dev,
libarmadillo-dev,
- libboost-math-dev
-Standards-Version: 3.9.8
-Vcs-Browser: https://anonscm.debian.org/cgit/debian-med/tvc.git
-Vcs-Git: https://anonscm.debian.org/git/debian-med/tvc.git
+ libboost-math-dev,
+ libsmithwaterman-dev
+Standards-Version: 4.2.1
+Vcs-Browser: https://salsa.debian.org/med-team/tvc
+Vcs-Git: https://salsa.debian.org/med-team/tvc.git
Homepage: http://ioncommunity.thermofisher.com
Package: tvc
=====================================
debian/copyright
=====================================
@@ -1,10 +1,42 @@
-Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: tvc
-Source: https://github.com/iontorrent
- http://updates.iontorrent.com/tvc_standalone/ (Dominique Belhachemi <domibel at debian.org>, Fri, 13 Jan 2017 10:11:18 -0500)
+Source: https://github.com/domibel/IonTorrent-VariantCaller
+Files-Excluded: */smithwaterman
+ */jsoncpp-src*
+ */.gitignore
+ */tabixpp/perl
+ */tabixpp/python
+ */tabixpp/Makefile
+ */tabixpp/*.java
+ */tabixpp/example*
+ */tabixpp/*.py
+ */tabixpp/*.tex
+ */pluginMedia/configs
+ */pluginMedia/hotspots
+ */parameter_sets/4_*
+ */parameter_sets/builtin_parameter_sets.json
+ */parameter_sets/parameter_sets.json
+ */parameter_sets/*.py
Files: *
Copyright: 2010-2014 Ion Torrent Systems, Inc.
+License: GPL-2+
+
+Files: external/freebayes/*
+Copyright: 2010 Erik Garrison, Gabor Marth
+License: MIT
+
+Files: external/vcflib/*
+Copyright: 2008-2009 Genome Research Ltd (GRL)
+ 2008-2010 Broad Institute / Massachusetts Institute of Technology
+ 2010-2012 Erik Garrison
+License: MIT
+
+Files: debian/*
+Copyright: 2015 Dominique Belhachemi <domibel at debian.org>
+ 2017-2018 Andreas Tille <tille at debian.org>
+License: GPL-2+
+
License: GPL-2+
This package is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -16,39 +48,9 @@ License: GPL-2+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
.
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>
- .
On Debian systems, the complete text of the GNU General
Public License version 2 can be found in "/usr/share/common-licenses/GPL-2".
-
-Files: external/freebayes/*
-Copyright: 2010 Erik Garrison, Gabor Marth
-License: MIT
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- .
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- .
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
-
-
-Files: external/vcflib/*
-Copyright: 2008-2009 Genome Research Ltd (GRL)
- 2008-2010 Broad Institute / Massachusetts Institute of Technology
- 2010-2012 Erik Garrison
License: MIT
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -67,23 +69,3 @@ License: MIT
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
-
-
-Files: debian/*
-Copyright: 2015 Dominique Belhachemi <domibel at debian.org>
-License: GPL-2+
- This package is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
- .
- This package is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- .
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>
- .
- On Debian systems, the complete text of the GNU General
- Public License version 2 can be found in "/usr/share/common-licenses/GPL-2".
=====================================
debian/lintian-overrides
=====================================
@@ -0,0 +1,2 @@
+# see https://lists.debian.org/debian-med/2018/06/msg00043.html
+tvc: script-with-language-extension usr/bin/variant_caller_pipeline.py
=====================================
debian/patches/series
=====================================
@@ -3,3 +3,5 @@ bamtools.patch
help.patch
hurd.patch
sse.patch
+use_debian_packaged_smithwaterman.patch
+# use_debian_packaged_vcflib.patch
=====================================
debian/patches/use_debian_packaged_smithwaterman.patch
=====================================
@@ -0,0 +1,42 @@
+Description: Use Debian packaged libsmithwaterman
+Author: Andreas Tille <tille at debian.org>
+Last-Update: Thu, 27 Sep 2018 08:59:37 +0200
+
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -85,10 +85,6 @@ set(tvcSRCS
+ ${ION_VCFLIB_DIR}/tabixpp/tabix.cpp
+ ${ION_VCFLIB_DIR}/tabixpp/index.c
+ ${ION_VCFLIB_DIR}/tabixpp/bgzf.c
+- ${ION_VCFLIB_DIR}/smithwaterman/LeftAlign.cpp
+- ${ION_VCFLIB_DIR}/smithwaterman/Repeats.cpp
+- ${ION_VCFLIB_DIR}/smithwaterman/IndelAllele.cpp
+- ${ION_VCFLIB_DIR}/smithwaterman/SmithWatermanGotoh.cpp
+ ${ION_FREEBAYES_DIR}/src/AlleleParser.cpp
+
+ BaseCaller/PIDloop.cpp
+@@ -108,7 +104,7 @@ if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL
+ endif()
+
+ add_executable(tvc ${tvcSRCS})
+-target_link_libraries(tvc bamtools z pthread blas lapack armadillo jsoncpp)
++target_link_libraries(tvc bamtools z pthread blas lapack armadillo jsoncpp disorder smithwaterman)
+
+
+ add_executable(tvcassembly
+@@ -135,14 +131,10 @@ add_executable(tvcutils
+ ${ION_VCFLIB_DIR}/tabixpp/tabix.cpp
+ ${ION_VCFLIB_DIR}/tabixpp/index.c
+ ${ION_VCFLIB_DIR}/tabixpp/bgzf.c
+- ${ION_VCFLIB_DIR}/smithwaterman/LeftAlign.cpp
+- ${ION_VCFLIB_DIR}/smithwaterman/Repeats.cpp
+- ${ION_VCFLIB_DIR}/smithwaterman/IndelAllele.cpp
+- ${ION_VCFLIB_DIR}/smithwaterman/SmithWatermanGotoh.cpp
+ ${PROJECT_BINARY_DIR}/IonVersion.cpp
+ )
+
+-target_link_libraries(tvcutils bamtools z jsoncpp)
++target_link_libraries(tvcutils bamtools z jsoncpp disorder smithwaterman)
+
+ install(TARGETS tvc DESTINATION bin)
+ install(TARGETS tvcassembly DESTINATION bin)
=====================================
debian/rules
=====================================
@@ -6,7 +6,7 @@ DPKG_EXPORT_BUILDFLAGS = 1
include /usr/share/dpkg/default.mk
# see FEATURE AREAS in dpkg-buildflags(1)
-#export DEB_BUILD_MAINT_OPTIONS = hardening=+all
+export DEB_BUILD_MAINT_OPTIONS = hardening=+all
# see ENVIRONMENT in dpkg-buildflags(1)
# package maintainers to append CFLAGS
=====================================
debian/watch
=====================================
@@ -0,0 +1,7 @@
+version=4
+
+opts="mode=git,pretty=5.0.3+git%cd.%h,repacksuffix=+dfsg,dversionmangle=s/\+dfsg//g,repack,compression=xz" \
+ https://github.com/domibel/IonTorrent-VariantCaller.git HEAD
+
+# This is not really maintained legacy code but useful anyway
+# It is not to be expected that upstream will set any tags
=====================================
external/vcflib/smithwaterman/BandedSmithWaterman.cpp deleted
=====================================
@@ -1,670 +0,0 @@
-#include "BandedSmithWaterman.h"
-
-// define our static constants
-const float CBandedSmithWaterman::FLOAT_NEGATIVE_INFINITY = (float)-1e+30;
-
-const DirectionType CBandedSmithWaterman::Directions_STOP = 0;
-const DirectionType CBandedSmithWaterman::Directions_LEFT = 1;
-const DirectionType CBandedSmithWaterman::Directions_DIAGONAL = 2;
-const DirectionType CBandedSmithWaterman::Directions_UP = 3;
-
-const PositionType CBandedSmithWaterman::Position_REF_AND_QUERY_ZERO = 0;
-const PositionType CBandedSmithWaterman::Position_REF_ZERO = 1;
-const PositionType CBandedSmithWaterman::Position_QUERY_ZERO = 2;
-const PositionType CBandedSmithWaterman::Position_REF_AND_QUERO_NONZERO = 3;
-
-// constructor
-CBandedSmithWaterman::CBandedSmithWaterman(float matchScore, float mismatchScore, float gapOpenPenalty, float gapExtendPenalty, unsigned int bandWidth)
-: mCurrentMatrixSize(0)
-, mCurrentAnchorSize(0)
-, mCurrentAQSumSize(0)
-, mBandwidth(bandWidth)
-, mPointers(NULL)
-, mMatchScore(matchScore)
-, mMismatchScore(mismatchScore)
-, mGapOpenPenalty(gapOpenPenalty)
-, mGapExtendPenalty(gapExtendPenalty)
-, mAnchorGapScores(NULL)
-, mBestScores(NULL)
-, mReversedAnchor(NULL)
-, mReversedQuery(NULL)
-, mUseHomoPolymerGapOpenPenalty(false)
-{
- CreateScoringMatrix();
-
- //if((bandWidth % 2) != 1) {
- //printf("ERROR: The bandwidth must be an odd number.\n");
- //exit(1);
- //}
-
- try {
- mBestScores = new float[bandWidth + 2];
- mAnchorGapScores = new float[bandWidth + 2];
- } catch(bad_alloc) {
- printf("ERROR: Unable to allocate enough memory for the banded Smith-Waterman algorithm.\n");
- exit(1);
- }
-}
-
-// destructor
-CBandedSmithWaterman::~CBandedSmithWaterman(void) {
- if(mPointers) delete [] mPointers;
- if(mAnchorGapScores) delete [] mAnchorGapScores;
- if(mBestScores) delete [] mBestScores;
- if(mReversedAnchor) delete [] mReversedAnchor;
- if(mReversedQuery) delete [] mReversedQuery;
-}
-
-// aligns the query sequence to the anchor using the Smith Waterman Gotoh algorithm
-void CBandedSmithWaterman::Align(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2, pair< pair<unsigned int, unsigned int>, pair<unsigned int, unsigned int> >& hr) {
-
-
-
- unsigned int rowStart = min(hr.first.first, (unsigned int)hr.second.first);
- hr.first.first -= rowStart;
- hr.second.first -= rowStart;
-
- //bool isLegalBandWidth = (s2.length() - hr.QueryBegin) > (mBandwidth / 2);
- // isLegalBandWidth = isLegalBandWidth && ((s1.length() - hr.Begin) > (mBandwidth / 2));
-
-
-
- // check the lengths of the input sequences
- //if( (s1.length() <= 0) || (s2.length() <= 0) || (s1.length() < s2.length()) ) {
- // printf("ERROR: An unexpected sequence length was encountered during pairwise alignment.\n");
- // printf("Sequence lengths are listed as following:\n");
- // printf("1. Reference length: %u\n2. Query length: %u\n", s1.length(), s2.length());
- //printf("3. Hash region in reference:%4u-%4u\n", hr.Begin + rowStart, hr.End);
- //printf("4. Hash region in query: %4u-%4u\n", hr.QueryBegin + rowStart, hr.QueryEnd);
- // exit(1);
- //}
-
-
- // determine the hash region type
- unsigned int rowOffset;
- unsigned int columnOffset;
- PositionType positionType;
-
- if(hr.first.first == 0) {
- if(hr.second.first == 0) {
- rowOffset = 1;
- columnOffset = (mBandwidth / 2) + 1;
- positionType = Position_REF_AND_QUERY_ZERO;
- } else {
- rowOffset = 1 - hr.second.first;
- columnOffset = (mBandwidth / 2) + 1 + hr.second.first;
- positionType = Position_REF_ZERO;
- }
- } else {
- if(hr.second.first == 0) {
- rowOffset = 1;
- columnOffset = (mBandwidth / 2) + 1 - hr.first.first;
- positionType = Position_QUERY_ZERO;
- } else {
- rowOffset = 1 - hr.second.first;
- columnOffset = (mBandwidth / 2) + 1 + hr.second.first - hr.first.first;
- positionType = Position_REF_AND_QUERO_NONZERO;
- }
- }
-
- // =========================
- // Reinitialize the matrices
- // =========================
-
- ReinitializeMatrices(positionType, s1.length(), s2.length(), hr);
-
- // =======================================
- // Banded Smith-Waterman forward algorithm
- // =======================================
-
- unsigned int bestColumn = 0;
- unsigned int bestRow = 0;
- float bestScore = FLOAT_NEGATIVE_INFINITY;
- float currentQueryGapScore;
-
- // rowNum and column indicate the row and column numbers in the Smith-Waterman matrix respectively
- unsigned int rowNum = hr.second.first;
- unsigned int columnNum = hr.first.first;
-
- // indicates how many rows including blank elements in the Banded SmithWaterman
- int numBlankElements = (mBandwidth / 2) - columnNum;
-
- //cout << numBlankElements << endl;
- // upper triangle matrix in Banded Smith-Waterman
- for( ; numBlankElements > 0; numBlankElements--, rowNum++){
- // in the upper triangle matrix, we always start at the 0th column
- columnNum = 0;
-
- // columnEnd indicates how many columns which should be dealt with in the current row
- unsigned int columnEnd = min((mBandwidth - numBlankElements), ((unsigned int) s1.length() - columnNum + 1) );
- currentQueryGapScore = FLOAT_NEGATIVE_INFINITY;
- for( unsigned int j = 0; j < columnEnd; j++){
- float score = CalculateScore(s1, s2, rowNum, columnNum, currentQueryGapScore, rowOffset, columnOffset);
- //cout << s1[columnNum] << s2[rowNum] << score << endl;
- UpdateBestScore(bestRow, bestColumn, bestScore, rowNum, columnNum, score);
- columnNum++;
- }
-
- // replace the columnNum to the middle column in the Smith-Waterman matrix
- columnNum = columnNum - (mBandwidth / 2);
- }
- // complete matrix in Banded Smith-Waterman
- unsigned int completeNum = min((s1.length() - columnNum - (mBandwidth / 2)), (s2.length() - rowNum));
- //cout << completeNum << endl;
- for(unsigned int i = 0; i < completeNum; i++, rowNum++){
- columnNum = columnNum - (mBandwidth / 2);
-
- // there are mBandwidth columns which should be dealt with in each row
- currentQueryGapScore = FLOAT_NEGATIVE_INFINITY;
-
- for(unsigned int j = 0; j < mBandwidth; j++){
- float score = CalculateScore(s1, s2, rowNum, columnNum, currentQueryGapScore, rowOffset, columnOffset);
- UpdateBestScore(bestRow, bestColumn, bestScore, rowNum, columnNum, score);
- //cout << s1[columnNum] << s2[rowNum] << score << endl;
- columnNum++;
- }
-
- // replace the columnNum to the middle column in the Smith-Waterman matrix
- // because mBandwidth is an odd number, everytime the following equation shifts a column (pluses 1).
- columnNum = columnNum - (mBandwidth / 2);
- }
-
- // lower triangle matrix
- numBlankElements = min(mBandwidth, ((unsigned int) s2.length() - rowNum));
- columnNum = columnNum - (mBandwidth / 2);
- for(unsigned int i = 0; numBlankElements > 0; i++, rowNum++, numBlankElements--) {
-
- mBestScores[ mBandwidth - i ] = FLOAT_NEGATIVE_INFINITY;;
- // columnEnd indicates how many columns which should be dealt with
- currentQueryGapScore = FLOAT_NEGATIVE_INFINITY;
-
- for( unsigned int j = columnNum; j < s1.length(); j++){
- float score = CalculateScore(s1, s2, rowNum, columnNum, currentQueryGapScore, rowOffset, columnOffset);
- UpdateBestScore(bestRow, bestColumn, bestScore, rowNum, columnNum, score);
- //cout << s1[columnNum] << s2[rowNum] << score << endl;
- columnNum++;
- }
-
- // replace the columnNum to the middle column in the Smith-Waterman matrix
- columnNum = columnNum - mBandwidth + i + 2;
- }
-
- // =========================================
- // Banded Smith-Waterman backtrace algorithm
- // =========================================
-
- Traceback(referenceAl, cigarAl, s1, s2, bestRow, bestColumn, rowOffset, columnOffset);
-
-}
-
-// calculates the score during the forward algorithm
-float CBandedSmithWaterman::CalculateScore(const string& s1, const string& s2, const unsigned int rowNum, const unsigned int columnNum, float& currentQueryGapScore, const unsigned int rowOffset, const unsigned int columnOffset) {
-
- // initialize
- const unsigned int row = rowNum + rowOffset;
- const unsigned int column = columnOffset - rowNum + columnNum;
- const unsigned int position = row * (mBandwidth + 2) + column;
-
- // retrieve the similarity scores
- const float similarityScore = mScoringMatrix[s1[columnNum] - 'A'][s2[rowNum] - 'A'];
- const float totalSimilarityScore = mBestScores[column] + similarityScore;
-
- // ================================
- // open a gap in the query sequence
- // ================================
-
- float queryGapExtendScore = currentQueryGapScore - mGapExtendPenalty;
- float queryGapOpenScore = mBestScores[column - 1] - mGapOpenPenalty;
-
- // compute the homo-polymer gap score if enabled
- if(mUseHomoPolymerGapOpenPenalty)
- if((rowNum > 1) && (s2[rowNum] == s2[rowNum - 1]))
- queryGapOpenScore = mBestScores[column - 1] - mHomoPolymerGapOpenPenalty;
-
- if(queryGapExtendScore > queryGapOpenScore) {
- currentQueryGapScore = queryGapExtendScore;
- mPointers[position].mSizeOfHorizontalGaps = mPointers[position - 1].mSizeOfHorizontalGaps + 1;
- } else currentQueryGapScore = queryGapOpenScore;
-
-
- // ====================================
- // open a gap in the reference sequence
- // ====================================
-
-
- float anchorGapExtendScore = mAnchorGapScores[column + 1] - mGapExtendPenalty;
- float anchorGapOpenScore = mBestScores[column + 1] - mGapOpenPenalty;
-
- // compute the homo-polymer gap score if enabled
- if(mUseHomoPolymerGapOpenPenalty)
- if((columnNum > 1) && (s1[columnNum] == s1[columnNum - 1]))
- anchorGapOpenScore = mBestScores[column + 1] - mHomoPolymerGapOpenPenalty;
-
- if(anchorGapExtendScore > anchorGapOpenScore) {
- mAnchorGapScores[column] = anchorGapExtendScore;
- mPointers[position].mSizeOfVerticalGaps = mPointers[position - mBandwidth - 1].mSizeOfVerticalGaps + 1;
- } else mAnchorGapScores[column] = anchorGapOpenScore;
-
- // ======================================
- // calculate the best score and direction
- // ======================================
-
- //mBestScores[column] = MaxFloats(totalSimilarityScore, mAnchorGapScores[column], currentQueryGapScore);
- mBestScores[column] = MaxFloats(totalSimilarityScore, currentQueryGapScore, mAnchorGapScores[column]);
-
- // determine the traceback direction
- // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495)
- if(mBestScores[column] == 0) mPointers[position].Direction = Directions_STOP;
- else if(mBestScores[column] == totalSimilarityScore) mPointers[position].Direction = Directions_UP;
- else if(mBestScores[column] == currentQueryGapScore) mPointers[position].Direction = Directions_LEFT;
- else mPointers[position].Direction = Directions_DIAGONAL;
-
- return mBestScores[column];
-}
-
-// corrects the homopolymer gap order for forward alignments
-void CBandedSmithWaterman::CorrectHomopolymerGapOrder(const unsigned int numBases, const unsigned int numMismatches) {
-
-
- // this is only required for alignments with mismatches
- //if(al.NumMismatches == 0) return;
- if ( numMismatches == 0 ) return;
-
- // localize the alignment data
- //char* pReference = al.Reference.Data();
- //char* pQuery = al.Query.Data();
- //const unsigned int numBases = al.Reference.Length();
- char* pReference = mReversedAnchor;
- char* pQuery = mReversedQuery;
-
- // initialize
- bool hasReferenceGap = false, hasQueryGap = false;
- char* pNonGapSeq = NULL;
- char* pGapSeq = NULL;
- char nonGapBase = 'J';
-
- // identify gapped regions
- for(unsigned int i = 0; i < numBases; i++) {
-
- // check if the current position is gapped
- hasReferenceGap = false;
- hasQueryGap = false;
-
- if(pReference[i] == GAP) {
- hasReferenceGap = true;
- pNonGapSeq = pQuery;
- pGapSeq = pReference;
- nonGapBase = pQuery[i];
- }
-
- if(pQuery[i] == GAP) {
- hasQueryGap = true;
- pNonGapSeq = pReference;
- pGapSeq = pQuery;
- nonGapBase = pReference[i];
- }
-
- // continue if we don't have any gaps
- if(!hasReferenceGap && !hasQueryGap) continue;
-
- // sanity check
- if(hasReferenceGap && hasQueryGap) {
- printf("ERROR: Found a gap in both the reference sequence and query sequence.\n");
- exit(1);
- }
-
- // find the non-gapped length (forward)
- unsigned short numGappedBases = 0;
- unsigned short nonGapLength = 0;
- unsigned short testPos = i;
- while(testPos < numBases) {
-
- const char gs = pGapSeq[testPos];
- const char ngs = pNonGapSeq[testPos];
-
- bool isPartofHomopolymer = false;
- if(((gs == nonGapBase) || (gs == GAP)) && (ngs == nonGapBase)) isPartofHomopolymer = true;
- if(!isPartofHomopolymer) break;
-
- if(gs == GAP) numGappedBases++;
- else nonGapLength++;
- testPos++;
- }
-
- // fix the gap order
- if(numGappedBases != 0) {
- char* pCurrentSequence = pGapSeq + i;
- memset(pCurrentSequence, nonGapBase, nonGapLength);
- pCurrentSequence += nonGapLength;
- memset(pCurrentSequence, GAP, numGappedBases);
- }
-
- // increment
- i += numGappedBases + nonGapLength - 1;
- }
-}
-
-// creates a simple scoring matrix to align the nucleotides and the ambiguity code N
-void CBandedSmithWaterman::CreateScoringMatrix(void) {
-
- unsigned int nIndex = 13;
- unsigned int xIndex = 23;
-
- // define the N score to be 1/4 of the span between mismatch and match
- //const short nScore = mMismatchScore + (short)(((mMatchScore - mMismatchScore) / 4.0) + 0.5);
-
- // calculate the scoring matrix
- for(unsigned char i = 0; i < MOSAIK_NUM_NUCLEOTIDES; i++) {
- for(unsigned char j = 0; j < MOSAIK_NUM_NUCLEOTIDES; j++) {
-
- // N.B. matching N to everything (while conceptually correct) leads to some
- // bad alignments, lets make N be a mismatch instead.
-
- // add the matches or mismatches to the hashtable (N is a mismatch)
- if((i == nIndex) || (j == nIndex)) mScoringMatrix[i][j] = mMismatchScore;
- else if((i == xIndex) || (j == xIndex)) mScoringMatrix[i][j] = mMismatchScore;
- else if(i == j) mScoringMatrix[i][j] = mMatchScore;
- else mScoringMatrix[i][j] = mMismatchScore;
- }
- }
-
- // add ambiguity codes
- mScoringMatrix['M' - 'A']['A' - 'A'] = mMatchScore; // M - A
- mScoringMatrix['A' - 'A']['M' - 'A'] = mMatchScore;
- // add ambiguity codes
- mScoringMatrix['M' - 'A']['A' - 'A'] = mMatchScore; // M - A
- mScoringMatrix['A' - 'A']['M' - 'A'] = mMatchScore;
- mScoringMatrix['M' - 'A']['C' - 'A'] = mMatchScore; // M - C
- mScoringMatrix['C' - 'A']['M' - 'A'] = mMatchScore;
-
- mScoringMatrix['R' - 'A']['A' - 'A'] = mMatchScore; // R - A
- mScoringMatrix['A' - 'A']['R' - 'A'] = mMatchScore;
- mScoringMatrix['R' - 'A']['G' - 'A'] = mMatchScore; // R - G
- mScoringMatrix['G' - 'A']['R' - 'A'] = mMatchScore;
-
- mScoringMatrix['W' - 'A']['A' - 'A'] = mMatchScore; // W - A
- mScoringMatrix['A' - 'A']['W' - 'A'] = mMatchScore;
- mScoringMatrix['W' - 'A']['T' - 'A'] = mMatchScore; // W - T
- mScoringMatrix['T' - 'A']['W' - 'A'] = mMatchScore;
-
- mScoringMatrix['S' - 'A']['C' - 'A'] = mMatchScore; // S - C
- mScoringMatrix['C' - 'A']['S' - 'A'] = mMatchScore;
- mScoringMatrix['S' - 'A']['G' - 'A'] = mMatchScore; // S - G
- mScoringMatrix['G' - 'A']['S' - 'A'] = mMatchScore;
-
- mScoringMatrix['Y' - 'A']['C' - 'A'] = mMatchScore; // Y - C
- mScoringMatrix['C' - 'A']['Y' - 'A'] = mMatchScore;
- mScoringMatrix['Y' - 'A']['T' - 'A'] = mMatchScore; // Y - T
- mScoringMatrix['T' - 'A']['Y' - 'A'] = mMatchScore;
-
- mScoringMatrix['K' - 'A']['G' - 'A'] = mMatchScore; // K - G
- mScoringMatrix['G' - 'A']['K' - 'A'] = mMatchScore;
- mScoringMatrix['K' - 'A']['T' - 'A'] = mMatchScore; // K - T
- mScoringMatrix['T' - 'A']['K' - 'A'] = mMatchScore;
-
- mScoringMatrix['V' - 'A']['A' - 'A'] = mMatchScore; // V - A
- mScoringMatrix['A' - 'A']['V' - 'A'] = mMatchScore;
- mScoringMatrix['V' - 'A']['C' - 'A'] = mMatchScore; // V - C
- mScoringMatrix['C' - 'A']['V' - 'A'] = mMatchScore;
- mScoringMatrix['V' - 'A']['G' - 'A'] = mMatchScore; // V - G
- mScoringMatrix['G' - 'A']['V' - 'A'] = mMatchScore;
-
- mScoringMatrix['H' - 'A']['A' - 'A'] = mMatchScore; // H - A
- mScoringMatrix['A' - 'A']['H' - 'A'] = mMatchScore;
- mScoringMatrix['H' - 'A']['C' - 'A'] = mMatchScore; // H - C
- mScoringMatrix['C' - 'A']['H' - 'A'] = mMatchScore;
- mScoringMatrix['H' - 'A']['T' - 'A'] = mMatchScore; // H - T
- mScoringMatrix['T' - 'A']['H' - 'A'] = mMatchScore;
-
- mScoringMatrix['D' - 'A']['A' - 'A'] = mMatchScore; // D - A
- mScoringMatrix['A' - 'A']['D' - 'A'] = mMatchScore;
- mScoringMatrix['D' - 'A']['G' - 'A'] = mMatchScore; // D - G
- mScoringMatrix['G' - 'A']['D' - 'A'] = mMatchScore;
- mScoringMatrix['D' - 'A']['T' - 'A'] = mMatchScore; // D - T
- mScoringMatrix['T' - 'A']['D' - 'A'] = mMatchScore;
-
- mScoringMatrix['B' - 'A']['C' - 'A'] = mMatchScore; // B - C
- mScoringMatrix['C' - 'A']['B' - 'A'] = mMatchScore;
- mScoringMatrix['B' - 'A']['G' - 'A'] = mMatchScore; // B - G
- mScoringMatrix['G' - 'A']['B' - 'A'] = mMatchScore;
- mScoringMatrix['B' - 'A']['T' - 'A'] = mMatchScore; // B - T
- mScoringMatrix['T' - 'A']['B' - 'A'] = mMatchScore;
-}
-
-// enables homo-polymer scoring
-void CBandedSmithWaterman::EnableHomoPolymerGapPenalty(float hpGapOpenPenalty) {
- mUseHomoPolymerGapOpenPenalty = true;
- mHomoPolymerGapOpenPenalty = hpGapOpenPenalty;
-}
-
-// reinitializes the matrices
-void CBandedSmithWaterman::ReinitializeMatrices(const PositionType& positionType, const unsigned int& s1Length, const unsigned int& s2Length, const pair< pair<unsigned int, unsigned int>, pair<unsigned int, unsigned int> > hr) {
-
-/*
- try {
- mBestScores = new float[mBandwidth + 2];
- mAnchorGapScores = new float[mBandwidth + 2];
- } catch(bad_alloc) {
- printf("ERROR: Unable to allocate enough memory for the banded Smith-Waterman algorithm.\n");
- exit(1);
- }
-*/
-
- const unsigned int numColumns = mBandwidth + 2;
- unsigned int numRows = 0;
-
- switch(positionType) {
- case Position_REF_AND_QUERY_ZERO:
- numRows = s2Length + 1;
- break;
- case Position_REF_ZERO:
- numRows = s2Length - hr.second.first + 2;
- break;
- case Position_QUERY_ZERO:
- numRows = min(s2Length + 1, s1Length - hr.first.first + 2);
- break;
- case Position_REF_AND_QUERO_NONZERO:
- numRows = min(s1Length - hr.first.first + 2, s2Length - hr.second.first + 2);
- break;
- }
-
- // update the size of the backtrace matrix
- if((numColumns * numRows) > mCurrentMatrixSize) {
-
- mCurrentMatrixSize = numColumns * numRows;
- if(mPointers) delete [] mPointers;
-
- try {
- mPointers = new ElementInfo[mCurrentMatrixSize];
- } catch(bad_alloc) {
- printf("ERROR: Unable to allocate enough memory for the banded Smith-Waterman algorithm.\n");
- exit(1);
- }
- }
-
- // initialize our backtrace matrix
- ElementInfo defaultElement;
- defaultElement.Direction = Directions_STOP;
- defaultElement.mSizeOfHorizontalGaps = 1;
- defaultElement.mSizeOfVerticalGaps = 1;
-
- uninitialized_fill(mPointers, mPointers + mCurrentMatrixSize, defaultElement);
-
- // update the sequence character arrays
- if((s1Length + s2Length) > mCurrentAQSumSize) {
-
- mCurrentAQSumSize = s1Length + s2Length;
- if(mReversedAnchor) delete [] mReversedAnchor;
- if(mReversedQuery) delete [] mReversedQuery;
-
- try {
- mReversedAnchor = new char[mCurrentAQSumSize + 1]; // reversed sequence #1
- mReversedQuery = new char[mCurrentAQSumSize + 1]; // reversed sequence #2
- } catch(bad_alloc) {
- printf("ERROR: Unable to allocate enough memory for the banded Smith-Waterman algorithm.\n");
- exit(1);
- }
- }
-
- // initialize the gap score and score vectors
- uninitialized_fill(mAnchorGapScores, mAnchorGapScores + mBandwidth + 2, FLOAT_NEGATIVE_INFINITY);
- memset((char*)mBestScores, 0, SIZEOF_FLOAT * (mBandwidth + 2));
- mBestScores[0] = FLOAT_NEGATIVE_INFINITY;
- mBestScores[mBandwidth + 1] = FLOAT_NEGATIVE_INFINITY;
-}
-
-// performs the backtrace algorithm
-void CBandedSmithWaterman::Traceback(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2, unsigned int bestRow, unsigned int bestColumn, const unsigned int rowOffset, const unsigned int columnOffset){
-
-
- unsigned int currentRow = bestRow;
- unsigned int currentColumn = bestColumn;
- unsigned int currentPosition = ((currentRow + rowOffset) * (mBandwidth + 2)) + (columnOffset - currentRow + currentColumn);
-
-
- // record the numbers of row and column before the current row and column
- unsigned int previousRow = bestRow;
- unsigned int previousColumn = bestColumn;
-
- unsigned int gappedAnchorLen = 0;
- unsigned int gappedQueryLen = 0;
- unsigned int numMismatches = 0;
-
- bool keepProcessing = true;
- while(keepProcessing) {
- unsigned int nVerticalGap = 0;
- unsigned int nHorizontalGap = 0;
- switch(mPointers[currentPosition].Direction){
- case Directions_DIAGONAL:
- nVerticalGap = mPointers[currentPosition].mSizeOfVerticalGaps;
- for(unsigned int i = 0; i < nVerticalGap; i++){
- mReversedAnchor[gappedAnchorLen++] = GAP;
- mReversedQuery[gappedQueryLen++] = s2[currentRow];
-
- numMismatches++;
-
- previousRow = currentRow;
- previousColumn = currentColumn;
-
- currentRow--;
- }
- break;
-
- case Directions_STOP:
- keepProcessing = false;
- //mReversedAnchor[gappedAnchorLen+1]='\0';
- //mReversedQuery [gappedQueryLen+1]='\0';
- break;
-
- case Directions_UP:
-
- mReversedAnchor[gappedAnchorLen++] = s1[currentColumn];
- mReversedQuery[gappedQueryLen++] = s2[currentRow];
-
- if(s1[currentColumn] != s2[currentRow]) numMismatches++;
- previousRow = currentRow;
- previousColumn = currentColumn;
-
- currentRow--;
- currentColumn--;
- break;
-
- case Directions_LEFT:
- nHorizontalGap = mPointers[currentPosition].mSizeOfHorizontalGaps;
- for(unsigned int i = 0; i < nHorizontalGap; i++){
-
- mReversedAnchor[gappedAnchorLen++] = s1[currentColumn];
- mReversedQuery[gappedQueryLen++] = GAP;
-
- numMismatches++;
-
- previousRow = currentRow;
- previousColumn = currentColumn;
-
-
- currentColumn--;
- }
- break;
- }
- currentPosition = ((currentRow + rowOffset) * (mBandwidth + 2)) + (columnOffset - currentRow + currentColumn);
- }
-
- // correct the reference and query sequence order
- mReversedAnchor[gappedAnchorLen] = 0;
- mReversedQuery [gappedQueryLen] = 0;
- reverse(mReversedAnchor, mReversedAnchor + gappedAnchorLen);
- reverse(mReversedQuery, mReversedQuery + gappedQueryLen);
-
- //alignment.Reference = mReversedAnchor;
- //alignment.Query = mReversedQuery;
-
- // assign the alignment endpoints
- //alignment.ReferenceBegin = previousColumn;
- //alignment.ReferenceEnd = bestColumn;
- referenceAl = previousColumn;
- /*
- if(alignment.IsReverseComplement){
- alignment.QueryBegin = s2.length() - bestRow - 1;
- alignment.QueryEnd = s2.length() - previousRow - 1;
- } else {
- alignment.QueryBegin = previousRow;
- alignment.QueryEnd = bestRow;
- }
- */
-
- //alignment.QueryLength = alignment.QueryEnd - alignment.QueryBegin + 1;
- //alignment.NumMismatches = numMismatches;
-
- const unsigned int alLength = strlen(mReversedAnchor);
- unsigned int m = 0, d = 0, i = 0;
- bool dashRegion = false;
- ostringstream oCigar;
-
- if ( previousRow != 0 )
- oCigar << previousRow << 'S';
-
- for ( unsigned int j = 0; j < alLength; j++ ) {
- // m
- if ( ( mReversedAnchor[j] != GAP ) && ( mReversedQuery[j] != GAP ) ) {
- if ( dashRegion ) {
- if ( d != 0 ) oCigar << d << 'D';
- else oCigar << i << 'I';
- }
- dashRegion = false;
- m++;
- d = 0;
- i = 0;
- }
- // I or D
- else {
- if ( !dashRegion )
- oCigar << m << 'M';
- dashRegion = true;
- m = 0;
- if ( mReversedAnchor[j] == GAP ) {
- if ( d != 0 ) oCigar << d << 'D';
- i++;
- d = 0;
- }
- else {
- if ( i != 0 ) oCigar << i << 'I';
- d++;
- i = 0;
- }
- }
- }
-
- if ( m != 0 ) oCigar << m << 'M';
- else if ( d != 0 ) oCigar << d << 'D';
- else if ( i != 0 ) oCigar << i << 'I';
-
- if ( ( bestRow + 1 ) != s2.length() )
- oCigar << s2.length() - bestRow - 1 << 'S';
-
- cigarAl = oCigar.str();
-
-
- // correct the homopolymer gap order
- CorrectHomopolymerGapOrder(alLength, numMismatches);
-
-}
=====================================
external/vcflib/smithwaterman/BandedSmithWaterman.h deleted
=====================================
@@ -1,111 +0,0 @@
-#pragma once
-
-#include <iostream>
-#include <algorithm>
-#include <memory>
-//#include "Alignment.h"
-#include "Mosaik.h"
-//#include "HashRegion.h"
-#include <string.h>
-#include <stdio.h>
-#include <sstream>
-#include <string>
-
-using namespace std;
-
-#define MOSAIK_NUM_NUCLEOTIDES 26
-#define GAP '-'
-
-typedef unsigned char DirectionType;
-typedef unsigned char PositionType;
-
-struct ElementInfo {
- unsigned int Direction : 2;
- unsigned int mSizeOfVerticalGaps : 15;
- unsigned int mSizeOfHorizontalGaps : 15;
-};
-
-class CBandedSmithWaterman {
-public:
- // constructor
- CBandedSmithWaterman(float matchScore, float mismatchScore, float gapOpenPenalty, float gapExtendPenalty, unsigned int bandWidth);
- // destructor
- ~CBandedSmithWaterman(void);
- // aligns the query sequence to the anchor using the Smith Waterman Gotoh algorithm
- void Align(unsigned int& referenceAl, string& stringAl, const string& s1, const string& s2, pair< pair<unsigned int, unsigned int>, pair<unsigned int, unsigned int> >& hr);
- // enables homo-polymer scoring
- void EnableHomoPolymerGapPenalty(float hpGapOpenPenalty);
-private:
- // calculates the score during the forward algorithm
- float CalculateScore(const string& s1, const string& s2, const unsigned int rowNum, const unsigned int columnNum, float& currentQueryGapScore, const unsigned int rowOffset, const unsigned int columnOffset);
- // creates a simple scoring matrix to align the nucleotides and the ambiguity code N
- void CreateScoringMatrix(void);
- // corrects the homopolymer gap order for forward alignments
- void CorrectHomopolymerGapOrder(const unsigned int numBases, const unsigned int numMismatches);
- // returns the maximum floating point number
- static inline float MaxFloats(const float& a, const float& b, const float& c);
- // reinitializes the matrices
- void ReinitializeMatrices(const PositionType& positionType, const unsigned int& s1Length, const unsigned int& s2Length, const pair< pair<unsigned int, unsigned int>, pair<unsigned int, unsigned int> > hr);
- // performs the backtrace algorithm
- void Traceback(unsigned int& referenceAl, string& stringAl, const string& s1, const string& s2, unsigned int bestRow, unsigned int bestColumn, const unsigned int rowOffset, const unsigned int columnOffset);
- // updates the best score during the forward algorithm
- inline void UpdateBestScore(unsigned int& bestRow, unsigned int& bestColumn, float& bestScore, const unsigned int rowNum, const unsigned int columnNum, const float score);
- // our simple scoring matrix
- float mScoringMatrix[MOSAIK_NUM_NUCLEOTIDES][MOSAIK_NUM_NUCLEOTIDES];
- // keep track of maximum initialized sizes
- unsigned int mCurrentMatrixSize;
- unsigned int mCurrentAnchorSize;
- unsigned int mCurrentAQSumSize;
- unsigned int mBandwidth;
- // define our backtrace directions
- const static DirectionType Directions_STOP;
- const static DirectionType Directions_LEFT;
- const static DirectionType Directions_DIAGONAL;
- const static DirectionType Directions_UP;
- // store the backtrace pointers
- ElementInfo* mPointers;
- // define our position types
- const static PositionType Position_REF_AND_QUERY_ZERO;
- const static PositionType Position_REF_ZERO;
- const static PositionType Position_QUERY_ZERO;
- const static PositionType Position_REF_AND_QUERO_NONZERO;
- // define scoring constants
- const float mMatchScore;
- const float mMismatchScore;
- const float mGapOpenPenalty;
- const float mGapExtendPenalty;
- // score if xi aligns to a gap after yi
- float* mAnchorGapScores;
- // best score of alignment x1...xi to y1...yi
- float* mBestScores;
- // our reversed alignment
- char* mReversedAnchor;
- char* mReversedQuery;
- // define static constants
- static const float FLOAT_NEGATIVE_INFINITY;
- // toggles the use of the homo-polymer gap open penalty
- bool mUseHomoPolymerGapOpenPenalty;
- float mHomoPolymerGapOpenPenalty;
-};
-
-// returns the maximum floating point number
-inline float CBandedSmithWaterman::MaxFloats(const float& a, const float& b, const float& c) {
- float max = 0.0f;
- if(a > max) max = a;
- if(b > max) max = b;
- if(c > max) max = c;
- return max;
-}
-
-// updates the best score during the forward algorithm
-inline void CBandedSmithWaterman::UpdateBestScore(unsigned int& bestRow, unsigned int& bestColumn, float& bestScore, const unsigned int rowNum, const unsigned int columnNum, const float score) {
-
- //const unsigned int row = rowNum + rowOffset;
- //const unsigned int column = columnOffset - rowNum + columnNum;
-
- if(score > bestScore) {
- bestRow = rowNum;
- bestColumn = columnNum;
- bestScore = score;
- }
-}
=====================================
external/vcflib/smithwaterman/IndelAllele.cpp deleted
=====================================
@@ -1,62 +0,0 @@
-#include "IndelAllele.h"
-
-using namespace std;
-
-
-bool IndelAllele::homopolymer(void) {
- string::iterator s = sequence.begin();
- char c = *s++;
- while (s != sequence.end()) {
- if (c != *s++) return false;
- }
- return true;
-}
-
-int IndelAllele::readLength(void) {
- if (insertion) {
- return length;
- } else {
- return 0;
- }
-}
-
-int IndelAllele::referenceLength(void) {
- if (insertion) {
- return 0;
- } else {
- return length;
- }
-}
-
-bool homopolymer(string sequence) {
- string::iterator s = sequence.begin();
- char c = *s++;
- while (s != sequence.end()) {
- if (c != *s++) return false;
- }
- return true;
-}
-
-ostream& operator<<(ostream& out, const IndelAllele& indel) {
- string t = indel.insertion ? "i" : "d";
- out << t << ":" << indel.position << ":" << indel.readPosition << ":" << indel.length << ":" << indel.sequence;
- return out;
-}
-
-bool operator==(const IndelAllele& a, const IndelAllele& b) {
- return (a.insertion == b.insertion
- && a.length == b.length
- && a.position == b.position
- && a.sequence == b.sequence);
-}
-
-bool operator!=(const IndelAllele& a, const IndelAllele& b) {
- return !(a==b);
-}
-
-bool operator<(const IndelAllele& a, const IndelAllele& b) {
- ostringstream as, bs;
- as << a;
- bs << b;
- return as.str() < bs.str();
-}
=====================================
external/vcflib/smithwaterman/IndelAllele.h deleted
=====================================
@@ -1,37 +0,0 @@
-#ifndef __INDEL_ALLELE_H
-#define __INDEL_ALLELE_H
-
-#include <string>
-#include <iostream>
-#include <sstream>
-
-using namespace std;
-
-class IndelAllele {
- friend ostream& operator<<(ostream&, const IndelAllele&);
- friend bool operator==(const IndelAllele&, const IndelAllele&);
- friend bool operator!=(const IndelAllele&, const IndelAllele&);
- friend bool operator<(const IndelAllele&, const IndelAllele&);
-public:
- bool insertion;
- int length;
- int referenceLength(void);
- int readLength(void);
- int position;
- int readPosition;
- string sequence;
-
- bool homopolymer(void);
-
- IndelAllele(bool i, int l, int p, int rp, string s)
- : insertion(i), length(l), position(p), readPosition(rp), sequence(s)
- { }
-};
-
-bool homopolymer(string sequence);
-ostream& operator<<(ostream& out, const IndelAllele& indel);
-bool operator==(const IndelAllele& a, const IndelAllele& b);
-bool operator!=(const IndelAllele& a, const IndelAllele& b);
-bool operator<(const IndelAllele& a, const IndelAllele& b);
-
-#endif
=====================================
external/vcflib/smithwaterman/LeftAlign.cpp deleted
=====================================
@@ -1,853 +0,0 @@
-#include "LeftAlign.h"
-
-//bool debug;
-#define VERBOSE_DEBUG
-
-// Attempts to left-realign all the indels represented by the alignment cigar.
-//
-// This is done by shifting all indels as far left as they can go without
-// mismatch, then merging neighboring indels of the same class. leftAlign
-// updates the alignment cigar with changes, and returns true if realignment
-// changed the alignment cigar.
-//
-// To left-align, we move multi-base indels left by their own length as long as
-// the preceding bases match the inserted or deleted sequence. After this
-// step, we handle multi-base homopolymer indels by shifting them one base to
-// the left until they mismatch the reference.
-//
-// To merge neighboring indels, we iterate through the set of left-stabilized
-// indels. For each indel we add a new cigar element to the new cigar. If a
-// deletion follows a deletion, or an insertion occurs at the same place as
-// another insertion, we merge the events by extending the previous cigar
-// element.
-//
-// In practice, we must call this function until the alignment is stabilized.
-//
-bool leftAlign(string& querySequence, string& cigar, string& baseReferenceSequence, int& offset, bool debug) {
-
- debug = false;
-
- string referenceSequence = baseReferenceSequence.substr(offset);
-
- int arsOffset = 0; // pointer to insertion point in aligned reference sequence
- string alignedReferenceSequence, alignedQuerySequence;
- if (debug) alignedReferenceSequence = referenceSequence;
- if (debug) alignedQuerySequence = querySequence;
- int aabOffset = 0;
-
- // store information about the indels
- vector<IndelAllele> indels;
-
- int rp = 0; // read position, 0-based relative to read
- int sp = 0; // sequence position
-
- string softBegin;
- string softEnd;
-
- string cigarbefore = cigar;
-
- vector<pair<int, string> > cigarData = splitCIGAR(cigar);
- for (vector<pair<int, string> >::const_iterator c = cigarData.begin();
- c != cigarData.end(); ++c) {
- unsigned int l = c->first;
- string t = c->second;
- if (debug) cerr << l << t << " " << sp << " " << rp << endl;
- if (t == "M") { // match or mismatch
- sp += l;
- rp += l;
- } else if (t == "D") { // deletion
- indels.push_back(IndelAllele(false, l, sp, rp, referenceSequence.substr(sp, l)));
- if (debug) { cerr << indels.back() << endl; alignedQuerySequence.insert(rp + aabOffset, string(l, '-')); }
- aabOffset += l;
- sp += l; // update reference sequence position
- } else if (t == "I") { // insertion
- indels.push_back(IndelAllele(true, l, sp, rp, querySequence.substr(rp, l)));
- if (debug) { cerr << indels.back() << endl; alignedReferenceSequence.insert(sp + softBegin.size() + arsOffset, string(l, '-')); }
- arsOffset += l;
- rp += l;
- } else if (t == "S") { // soft clip, clipped sequence present in the read not matching the reference
- // remove these bases from the refseq and read seq, but don't modify the alignment sequence
- if (rp == 0) {
- alignedReferenceSequence = string(l, '*') + alignedReferenceSequence;
- //indels.push_back(IndelAllele(true, l, sp, rp, querySequence.substr(rp, l)));
- softBegin = querySequence.substr(0, l);
- } else {
- alignedReferenceSequence = alignedReferenceSequence + string(l, '*');
- //indels.push_back(IndelAllele(true, l, sp, rp, querySequence.substr(rp, l)));
- softEnd = querySequence.substr(querySequence.size() - l, l);
- }
- rp += l;
- } else if (t == "H") { // hard clip on the read, clipped sequence is not present in the read
- } else if (t == "N") { // skipped region in the reference not present in read, aka splice
- sp += l;
- }
- }
-
-
- if (debug) cerr << "| " << cigarbefore << endl
- << "| " << alignedReferenceSequence << endl
- << "| " << alignedQuerySequence << endl;
-
- // if no indels, return the alignment
- if (indels.empty()) { return false; }
-
- if (debug) {
- for (vector<IndelAllele>::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " ";
- cerr << endl;
- }
-
- // for each indel, from left to right
- // while the indel sequence repeated to the left and we're not matched up with the left-previous indel
- // move the indel left
-
- vector<IndelAllele>::iterator previous = indels.begin();
- for (vector<IndelAllele>::iterator id = indels.begin(); id != indels.end(); ++id) {
-
- // left shift by repeats
- //
- // from 1 base to the length of the indel, attempt to shift left
- // if the move would cause no change in alignment optimality (no
- // introduction of mismatches, and by definition no change in gap
- // length), move to the new position.
- // in practice this moves the indel left when we reach the size of
- // the repeat unit.
- //
- int steppos, readsteppos;
- IndelAllele& indel = *id;
- int i = 1;
-
- while (i <= indel.length) {
-
- int steppos = indel.position - i;
- int readsteppos = indel.readPosition - i;
-
- if (debug) {
- if (steppos >= 0 && readsteppos >= 0) {
- cerr << "refseq flank " << referenceSequence.substr(steppos, indel.length) << endl;
- cerr << "qryseq flank " << querySequence.substr(readsteppos, indel.length) << endl;
- cerr << "indelseq " << indel.sequence << endl;
- }
- }
-
- while (steppos >= 0 && readsteppos >= 0
- && indel.sequence == referenceSequence.substr(steppos, indel.length)
- && indel.sequence == querySequence.substr(readsteppos, indel.length)
- && (id == indels.begin()
- || (previous->insertion && steppos >= previous->position)
- || (!previous->insertion && steppos >= previous->position + previous->length))) {
- LEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " shifting " << i << "bp left" << endl);
- indel.position -= i;
- indel.readPosition -= i;
- steppos = indel.position - i;
- readsteppos = indel.readPosition - i;
- }
- do {
- ++i;
- } while (i <= indel.length && indel.length % i != 0);
- }
-
-
-
- // left shift indels with exchangeable flanking sequence
- //
- // for example:
- //
- // GTTACGTT GTTACGTT
- // GT-----T ----> G-----TT
- //
- // GTGTGACGTGT GTGTGACGTGT
- // GTGTG-----T ----> GTG-----TGT
- //
- // GTGTG-----T GTG-----TGT
- // GTGTGACGTGT ----> GTGTGACGTGT
- //
- //
-
- steppos = indel.position - 1;
- readsteppos = indel.readPosition - 1;
- while (steppos >= 0 && readsteppos >= 0
- && querySequence.at(readsteppos) == referenceSequence.at(steppos)
- && (int)referenceSequence.size() > steppos + indel.length
- && indel.sequence.at((int) indel.sequence.size() - 1) == referenceSequence.at(steppos + indel.length) // are the exchanged bases going to match wrt. the reference?
- && querySequence.at(readsteppos) == indel.sequence.at((int) indel.sequence.size() - 1)
- && (id == indels.begin()
- || (previous->insertion && indel.position - 1 >= previous->position)
- || (!previous->insertion && indel.position - 1 >= previous->position + previous->length))) {
- if (debug) cerr << (indel.insertion ? "insertion " : "deletion ") << indel << " exchanging bases " << 1 << "bp left" << endl;
- indel.sequence = indel.sequence.at(indel.sequence.size() - 1) + indel.sequence.substr(0, indel.sequence.size() - 1);
- indel.position -= 1;
- indel.readPosition -= 1;
- if (debug) cerr << indel << endl;
- steppos = indel.position - 1;
- readsteppos = indel.readPosition - 1;
- //if (debug && steppos && readsteppos) cerr << querySequence.at(readsteppos) << " ==? " << referenceSequence.at(steppos) << endl;
- //if (debug && steppos && readsteppos) cerr << indel.sequence.at((int) indel.sequence.size() - 1) << " ==? " << referenceSequence.at(steppos + indel.length) << endl;
- }
- // tracks previous indel, so we don't run into it with the next shift
- previous = id;
- }
-
- if (debug) {
- for (vector<IndelAllele>::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " ";
- cerr << endl;
- }
-
- if (debug) cerr << "bring together floating indels" << endl;
-
- // bring together floating indels
- // from left to right
- // check if we could merge with the next indel
- // if so, adjust so that we will merge in the next step
- if (indels.size() > 1) {
- previous = indels.begin();
- for (vector<IndelAllele>::iterator id = (indels.begin() + 1); id != indels.end(); ++id) {
- IndelAllele& indel = *id;
- // parsimony: could we shift right and merge with the previous indel?
- // if so, do it
- int prev_end_ref = previous->insertion ? previous->position : previous->position + previous->length;
- int prev_end_read = !previous->insertion ? previous->readPosition : previous->readPosition + previous->length;
- if (previous->insertion == indel.insertion
- && ((previous->insertion
- && (previous->position < indel.position
- && previous->readPosition < indel.readPosition))
- ||
- (!previous->insertion
- && (previous->position + previous->length < indel.position)
- && (previous->readPosition < indel.readPosition)
- ))) {
- if (previous->homopolymer()) {
- string seq = referenceSequence.substr(prev_end_ref, indel.position - prev_end_ref);
- string readseq = querySequence.substr(prev_end_read, indel.position - prev_end_ref);
- if (debug) cerr << "seq: " << seq << endl << "readseq: " << readseq << endl;
- if (previous->sequence.at(0) == seq.at(0)
- && homopolymer(seq)
- && homopolymer(readseq)) {
- if (debug) cerr << "moving " << *previous << " right to "
- << (indel.insertion ? indel.position : indel.position - previous->length) << endl;
- previous->position = indel.insertion ? indel.position : indel.position - previous->length;
- previous->readPosition = !indel.insertion ? indel.readPosition : indel.readPosition - previous->readLength(); // should this be readLength?
- }
- }
- /*
- else {
- int pos = previous->position;
- int readpos = previous->readPosition;
- while (pos < (int) referenceSequence.length() &&
- ((previous->insertion && pos + previous->length <= indel.position)
- ||
- (!previous->insertion && pos + previous->length < indel.position))
- && previous->sequence == referenceSequence.substr(pos + previous->length, previous->length)
- && previous->sequence == querySequence.substr(readpos + previous->length, previous->length)
- ) {
- pos += previous->length;
- readpos += previous->length;
- }
- string seq = previous->sequence;
- if (pos > previous->position) {
- // wobble bases right to left as far as we can go
- int steppos = previous->position + seq.size();
- int readsteppos = previous->readPosition + seq.size();
-
- while (querySequence.at(readsteppos) == referenceSequence.at(steppos)
- && querySequence.at(readsteppos) == seq.at(0)
- && (id == indels.begin()
- || (indel.insertion && pos + seq.size() - 1 <= indel.position)
- || (!previous->insertion && indel.position - 1 >= pos + previous->length))) {
- seq = seq.substr(1) + seq.at(0);
- ++pos;
- ++readpos;
- steppos = pos + 1;
- readsteppos = readpos + 1;
- }
-
- if (((previous->insertion && pos + previous->length == indel.position)
- ||
- (!previous->insertion && pos == indel.position - previous->length))
- ) {
- if (debug) cerr << "right-merging tandem repeat: moving " << *previous << " right to " << pos << endl;
- previous->position = pos;
- previous->readPosition = readpos;
- previous->sequence = seq;
- }
- }
- }
- */
- }
- previous = id;
- }
- }
-
- if (debug) {
- for (vector<IndelAllele>::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " ";
- cerr << endl;
- }
-
-
- if (debug) cerr << "bring in indels at ends of read" << endl;
-
- // try to "bring in" repeat indels at the end, for maximum parsimony
- //
- // e.g.
- //
- // AGAAAGAAAGAAAAAGAAAAAGAACCAAGAAGAAAA
- // AGAAAG------AAAGAAAAAGAACCAAGAAGAAAA
- //
- // has no information which distinguishes it from:
- //
- // AGAAAGAAAAAGAAAAAGAACCAAGAAGAAAA
- // AGAAAG--AAAGAAAAAGAACCAAGAAGAAAA
- //
- // here we take the parsimonious explanation
-
- if (!indels.empty()) {
- // deal with the first indel
- // the first deletion ... or the biggest deletion
- vector<IndelAllele>::iterator a = indels.begin();
- vector<IndelAllele>::iterator del = indels.begin();
- for (; a != indels.end(); ++a) {
- //if (!a->insertion && a->length > biggestDel->length) biggestDel = a;
- if (!a->insertion && a->length) del = a;
- if (!del->insertion) {
- //if (!indel.insertion) { // only handle deletions like this for now
- //if (!indel.insertion && !(indels.size() > 1 && indel.readPosition == indels.at(1).readPosition)) { // only handle deletions like this for now
- int insertedBpBefore = 0;
- int deletedBpBefore = 0;
- for (vector<IndelAllele>::iterator i = indels.begin(); i != del; ++i) {
- if (i->insertion) insertedBpBefore += i->length;
- else deletedBpBefore += i->length;
- }
- IndelAllele& indel = *del;
- int minsize = indel.length;
- int flankingLength = indel.readPosition;
- if (debug) cerr << indel << endl;
- string flanking = querySequence.substr(0, flankingLength);
- if (debug) cerr << flanking << endl;
-
- size_t p = referenceSequence.substr(0, indel.position + indel.length).rfind(flanking);
- if (p == string::npos) {
- if (debug) cerr << "flanking not found" << endl;
- } else {
- if (debug) {
- cerr << "flanking is at " << p << endl;
- cerr << "minsize would be " << (indel.position + indel.length) - ((int) p + flankingLength) << endl;
- }
- minsize = (indel.position + indel.length) - ((int) p + flankingLength);
- }
-
- if (debug) cerr << minsize << endl;
-
- if (minsize >= 0 && minsize < indel.length) {
-
- int softdiff = softBegin.size();
- if (!softBegin.empty()) { // remove soft clips if we can
- if (flankingLength < (int)softBegin.size()) {
- softBegin = softBegin.substr(0, flankingLength - softBegin.size());
- softdiff -= softBegin.size();
- } else {
- softBegin.clear();
- }
- }
-
- // the new read position == the current read position
- // the new reference position == the flanking length size
- // the positional offset of the reference sequence == the new position of the deletion - the flanking length
-
- int diff = indel.length - minsize - softdiff + deletedBpBefore - insertedBpBefore;
- //int querydiff = indel.length - minsize - softBegin.size() - insertedBpBefore + deletedBpBefore;
- if (debug) cerr << "adjusting " << indel.length <<" " << minsize << " " << softdiff << " " << diff << endl;
- offset += diff;
- ///
- indel.length = minsize;
- indel.sequence = indel.sequence.substr(indel.sequence.size() - minsize);
- indel.position = flankingLength;
- indel.readPosition = indel.position; // if we have removed all the sequence before, this should be ==
- referenceSequence = referenceSequence.substr(diff);
-
- for (vector<IndelAllele>::iterator i = indels.begin(); i != indels.end(); ++i) {
- if (i < del) {
- i->length = 0; // remove
- } else if (i > del) {
- i->position -= diff;
- }
- }
- }
- if (debug) cerr << indel << endl;
-
- // now, do the same for the reverse
- if (indel.length > 0) {
- int minsize = indel.length + 1;
- int flankingLength = querySequence.size() - indel.readPosition + indel.readLength();
- string flanking = querySequence.substr(indel.readPosition + indel.readLength(), flankingLength);
- //int indelRefEnd = indel.position + indel.referenceLength();
-
- size_t p = referenceSequence.find(flanking, indel.position);
- if (p == string::npos) {
- if (debug)
- cerr << "flanking not found" << endl;
- } else {
- if (debug) {
- cerr << "flanking is at " << p << endl;
- cerr << "minsize would be " << (int) p - indel.position << endl;
- }
- minsize = (int) p - indel.position;
- }
-
- if (debug) cerr << "minsize " << minsize << endl;
- if (minsize >= 0 && minsize <= indel.length) {
- //referenceSequence = referenceSequence.substr(0, referenceSequence.size() - (indel.length - minsize));
- if (debug) cerr << "adjusting " << indel << endl;
- indel.length = minsize;
- indel.sequence = indel.sequence.substr(0, minsize);
- //cerr << indel << endl;
- if (!softEnd.empty()) { // remove soft clips if we can
- if (flankingLength < (int)softEnd.size()) {
- softEnd = softEnd.substr(flankingLength - softEnd.size());
- } else {
- softEnd.clear();
- }
- }
- for (vector<IndelAllele>::iterator i = indels.begin(); i != indels.end(); ++i) {
- if (i > del) {
- i->length = 0; // remove
- }
- }
- }
- }
- }
- }
- }
-
- if (debug) {
- for (vector<IndelAllele>::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " ";
- cerr << endl;
- }
-
- if (debug) cerr << "parsing indels" << endl;
-
- // if soft clipping can be reduced by adjusting the tailing indels in the read, do it
- // TODO
-
- /*
- int numEmptyIndels = 0;
-
- if (!indels.empty()) {
- vector<IndelAllele>::iterator a = indels.begin();
- while (a != indels.end()) {
- if (debug) cerr << "parsing " << *a << endl;
- if (!(a->length > 0 && a->position >= 0)) {
- ++numEmptyIndels;
- }
- ++a;
- }
- }
- */
-
- for (vector<IndelAllele>::iterator i = indels.begin(); i != indels.end(); ++i) {
- if (i->length == 0) continue;
- if (i->insertion) {
- if (querySequence.substr(i->readPosition, i->readLength()) != i->sequence) {
- cerr << "failure: " << *i << " should be " << querySequence.substr(i->readPosition, i->readLength()) << endl;
- cerr << baseReferenceSequence << endl;
- cerr << querySequence << endl;
- throw 1;
- }
- } else {
- if (referenceSequence.substr(i->position, i->length) != i->sequence) {
- cerr << "failure: " << *i << " should be " << referenceSequence.substr(i->position, i->length) << endl;
- cerr << baseReferenceSequence << endl;
- cerr << querySequence << endl;
- throw 1;
- }
- }
- }
-
- if (indels.size() > 1) {
- vector<IndelAllele>::iterator id = indels.begin();
- while ((id + 1) != indels.end()) {
- if (debug) {
- cerr << "indels: ";
- for (vector<IndelAllele>::iterator a = indels.begin(); a != indels.end(); ++a) cerr << *a << " ";
- cerr << endl;
- //for (vector<IndelAllele>::iterator a = newIndels.begin(); a != newIndels.end(); ++a) cerr << *a << " ";
- //cerr << endl;
- }
-
- // get the indels to try to merge
- while (id->length == 0 && (id + 1) != indels.end()) ++id;
- vector<IndelAllele>::iterator idn = (id + 1);
- while (idn != indels.end() && idn->length == 0) ++idn;
- if (idn == indels.end()) break;
-
- IndelAllele& indel = *idn;
- IndelAllele& last = *id;
- if (debug) cerr << "trying " << last << " against " << indel << endl;
-
- int lastend = last.insertion ? last.position : (last.position + last.length);
- if (indel.position == lastend) {
- if (debug) cerr << "indel.position " << indel.position << " lastend " << lastend << endl;
- if (indel.insertion == last.insertion) {
- last.length += indel.length;
- last.sequence += indel.sequence;
- indel.length = 0;
- indel.sequence.clear();
- id = idn;
- } else if (last.length && indel.length) { // if the end of the previous == the start of the current, cut it off of both the ins and the del
- if (debug) cerr << "Merging " << last << " " << indel << endl;
- int matchsize = 1;
- int biggestmatchsize = 0;
-
- while (matchsize <= (int)last.sequence.size() && matchsize <= (int)indel.sequence.size()) {
- if (last.sequence.substr(last.sequence.size() - matchsize) == indel.sequence.substr(0, matchsize)) {
- biggestmatchsize = matchsize;
- }
- ++matchsize;
- }
- if (debug) cerr << "biggestmatchsize " << biggestmatchsize << endl;
-
- last.sequence = last.sequence.substr(0, last.sequence.size() - biggestmatchsize);
- last.length -= biggestmatchsize;
- indel.sequence = indel.sequence.substr(biggestmatchsize);
- indel.length -= biggestmatchsize;
- if (indel.insertion) indel.readPosition += biggestmatchsize;
- else indel.position += biggestmatchsize;
-
- if (indel.length > 0) {
- id = idn;
- }
- }
- } else {
- if (last.insertion != indel.insertion) {
- if (debug) cerr << "merging by overlap " << last << " " << indel << endl;
- // see if we can slide the sequence in between these two indels together
- string lastOverlapSeq;
- string indelOverlapSeq;
-
- if (last.insertion) {
- lastOverlapSeq =
- last.sequence
- + querySequence.substr(last.readPosition + last.readLength(),
- indel.readPosition - (last.readPosition + last.readLength()));
- indelOverlapSeq =
- referenceSequence.substr(last.position + last.referenceLength(),
- indel.position - (last.position + last.referenceLength()))
- + indel.sequence;
- } else {
- lastOverlapSeq =
- last.sequence
- + referenceSequence.substr(last.position + last.referenceLength(),
- indel.position - (last.position + last.referenceLength()));
- indelOverlapSeq =
- querySequence.substr(last.readPosition + last.readLength(),
- indel.readPosition - (last.readPosition + last.readLength()))
- + indel.sequence;
- }
-
- if (debug) {
- if (!last.insertion) {
- if (last.insertion) cerr << string(last.length, '-');
- cerr << lastOverlapSeq;
- if (indel.insertion) cerr << string(indel.length, '-');
- cerr << endl;
- if (!last.insertion) cerr << string(last.length, '-');
- cerr << indelOverlapSeq;
- if (!indel.insertion) cerr << string(indel.length, '-');
- cerr << endl;
- } else {
- if (last.insertion) cerr << string(last.length, '-');
- cerr << indelOverlapSeq;
- if (indel.insertion) cerr << string(indel.length, '-');
- cerr << endl;
- if (!last.insertion) cerr << string(last.length, '-');
- cerr << lastOverlapSeq;
- if (!indel.insertion) cerr << string(indel.length, '-');
- cerr << endl;
- }
- }
-
-
- int dist = min(last.length, indel.length);
- int matchingInBetween = indel.position - (last.position + last.referenceLength());
- int previousMatchingInBetween = matchingInBetween;
- //int matchingInBetween = indel.position - last.position;
- if (debug) cerr << "matchingInBetween " << matchingInBetween << endl;
- if (debug) cerr << "dist " << dist << endl;
- //int mindist = matchingInBetween - dist;
- if (lastOverlapSeq == indelOverlapSeq) {
- matchingInBetween = lastOverlapSeq.size();
- } else {
- // TODO change to use string::find()
- for (int i = dist; i > 0; --i) {
- if (debug) cerr << "lastoverlap: "
- << lastOverlapSeq.substr(lastOverlapSeq.size() - previousMatchingInBetween - i)
- << " thisoverlap: "
- << indelOverlapSeq.substr(0, i + previousMatchingInBetween) << endl;
- if (lastOverlapSeq.substr(lastOverlapSeq.size() - previousMatchingInBetween - i)
- == indelOverlapSeq.substr(0, i + previousMatchingInBetween)) {
- matchingInBetween = previousMatchingInBetween + i;
- break;
- }
- }
- }
- //cerr << last << " " << indel << endl;
- if (matchingInBetween > 0 && matchingInBetween > previousMatchingInBetween) {
- if (debug) cerr << "matching " << matchingInBetween << "bp between " << last << " " << indel
- << " was matching " << previousMatchingInBetween << endl;
- int diff = matchingInBetween - previousMatchingInBetween;
- last.length -= diff;
- last.sequence = last.sequence.substr(0, last.length);
- indel.length -= diff;
- indel.sequence = indel.sequence.substr(diff);
- if (!indel.insertion) indel.position += diff;
- else indel.readPosition += diff;
- if (debug) cerr << last << " " << indel << endl;
- }// else if (matchingInBetween == 0 || matchingInBetween == indel.position - last.position) {
- //if (!newIndels.empty()) newIndels.pop_back();
- //} //else { newIndels.push_back(indel); }
- id = idn;
- //newIndels.push_back(indel);
- } else {
- id = idn;
- //newIndels.push_back(indel);
- }
- }
- }
- }
-
- vector<IndelAllele> newIndels;
- for (vector<IndelAllele>::iterator i = indels.begin(); i != indels.end(); ++i) {
- if (!i->insertion && i->position == 0) { offset += i->length;
- } else if (i->length > 0) newIndels.push_back(*i); // remove dels at front
- }
-
- // for each indel
- // if ( we're matched up to the previous insertion (or deletion)
- // and it's also an insertion or deletion )
- // merge the indels
- //
- // and simultaneously reconstruct the cigar
- //
- // if there are spurious deletions at the start and end of the read, remove them
- // if there are spurious insertions after soft-clipped bases, make them soft clips
-
- vector<pair<int, string> > newCigar;
-
- if (!softBegin.empty()) {
- newCigar.push_back(make_pair(softBegin.size(), "S"));
- }
-
- if (newIndels.empty()) {
-
- int remainingReadBp = querySequence.size() - softEnd.size() - softBegin.size();
- newCigar.push_back(make_pair(remainingReadBp, "M"));
-
- if (!softEnd.empty()) {
- newCigar.push_back(make_pair(softEnd.size(), "S"));
- }
-
- cigar = joinCIGAR(newCigar);
-
- // check if we're realigned
- if (cigar == cigarbefore) {
- return false;
- } else {
- return true;
- }
- }
-
- vector<IndelAllele>::iterator id = newIndels.begin();
- vector<IndelAllele>::iterator last = id++;
-
- if (last->position > 0) {
- newCigar.push_back(make_pair(last->position, "M"));
- newCigar.push_back(make_pair(last->length, (last->insertion ? "I" : "D")));
- } else if (last->position == 0) { // discard floating indels
- if (last->insertion) newCigar.push_back(make_pair(last->length, "S"));
- else newCigar.push_back(make_pair(last->length, "D"));
- } else {
- cerr << "negative indel position " << *last << endl;
- }
-
- int lastend = last->insertion ? last->position : (last->position + last->length);
- LEFTALIGN_DEBUG(*last << ",");
-
- for (; id != newIndels.end(); ++id) {
- IndelAllele& indel = *id;
- if (indel.length == 0) continue; // remove 0-length indels
- if (debug) cerr << indel << " " << *last << endl;
- LEFTALIGN_DEBUG(indel << ",");
- if ((((id + 1) == newIndels.end()
- && indel.insertion && indel.position == (int)referenceSequence.size())
- || (!indel.insertion && indel.position + indel.length == (int)referenceSequence.size()))
- ) {
- if (indel.insertion) {
- if (!newCigar.empty() && newCigar.back().second == "S") {
- newCigar.back().first += indel.length;
- } else {
- newCigar.push_back(make_pair(indel.length, "S"));
- }
- }
- } else if (indel.position < lastend) {
- cerr << "impossibility?: indel realigned left of another indel" << endl;
- return false;
- } else if (indel.position == lastend) {
- // how?
- if (indel.insertion == last->insertion) {
- pair<int, string>& op = newCigar.back();
- op.first += indel.length;
- } else {
- newCigar.push_back(make_pair(indel.length, (indel.insertion ? "I" : "D")));
- }
- } else if (indel.position > lastend) { // also catches differential indels, but with the same position
- if (!newCigar.empty() && newCigar.back().second == "M") newCigar.back().first += indel.position - lastend;
- else newCigar.push_back(make_pair(indel.position - lastend, "M"));
- newCigar.push_back(make_pair(indel.length, (indel.insertion ? "I" : "D")));
- }
-
- last = id;
- lastend = last->insertion ? last->position : (last->position + last->length);
-
- if (debug) {
- for (vector<pair<int, string> >::iterator c = newCigar.begin(); c != newCigar.end(); ++c)
- cerr << c->first << c->second;
- cerr << endl;
- }
-
- }
-
- int remainingReadBp = querySequence.size() - (last->readPosition + last->readLength()) - softEnd.size();
- if (remainingReadBp > 0) {
- if (debug) cerr << "bp remaining = " << remainingReadBp << endl;
- if (newCigar.back().second == "M") newCigar.back().first += remainingReadBp;
- else newCigar.push_back(make_pair(remainingReadBp, "M"));
- }
-
- if (newCigar.back().second == "D") newCigar.pop_back(); // remove trailing deletions
-
- if (!softEnd.empty()) {
- if (newCigar.back().second == "S") newCigar.back().first += softEnd.size();
- else newCigar.push_back(make_pair(softEnd.size(), "S"));
- }
-
- LEFTALIGN_DEBUG(endl);
-
- cigar = joinCIGAR(newCigar);
-
- LEFTALIGN_DEBUG(cigar << endl);
-
- // check if we're realigned
- if (cigar == cigarbefore) {
- return false;
- } else {
- return true;
- }
-
-}
-
-int countMismatches(string& querySequence, string& cigar, string referenceSequence) {
-
- int mismatches = 0;
- int sp = 0;
- int rp = 0;
- vector<pair<int, string> > cigarData = splitCIGAR(cigar);
- for (vector<pair<int, string> >::const_iterator c = cigarData.begin();
- c != cigarData.end(); ++c) {
- unsigned int l = c->first;
- string t = c->second;
- if (t == "M") { // match or mismatch
- for (int i = 0; i < (int)l; ++i) {
- if (querySequence.at(rp) != referenceSequence.at(sp))
- ++mismatches;
- ++sp;
- ++rp;
- }
- } else if (t == "D") { // deletion
- sp += l; // update reference sequence position
- } else if (t == "I") { // insertion
- rp += l; // update read position
- } else if (t == "S") { // soft clip, clipped sequence present in the read not matching the reference
- rp += l;
- } else if (t == "H") { // hard clip on the read, clipped sequence is not present in the read
- } else if (t == "N") { // skipped region in the reference not present in read, aka splice
- sp += l;
- }
- }
-
- return mismatches;
-
-}
-
-// Iteratively left-aligns the indels in the alignment until we have a stable
-// realignment. Returns true on realignment success or non-realignment.
-// Returns false if we exceed the maximum number of realignment iterations.
-//
-bool stablyLeftAlign(string querySequence, string& cigar, string referenceSequence, int& offset, int maxiterations, bool debug) {
-
- if (!leftAlign(querySequence, cigar, referenceSequence, offset)) {
-
- LEFTALIGN_DEBUG("did not realign" << endl);
- return true;
-
- } else {
-
- while (leftAlign(querySequence, cigar, referenceSequence, offset) && --maxiterations > 0) {
- LEFTALIGN_DEBUG("realigning ..." << endl);
- }
-
- if (maxiterations <= 0) {
- return false;
- } else {
- return true;
- }
- }
-}
-
-string mergeCIGAR(const string& c1, const string& c2) {
- vector<pair<int, string> > cigar1 = splitCIGAR(c1);
- vector<pair<int, string> > cigar2 = splitCIGAR(c2);
- // check if the middle elements are the same
- if (cigar1.back().second == cigar2.front().second) {
- cigar1.back().first += cigar2.front().first;
- cigar2.erase(cigar2.begin());
- }
- for (vector<pair<int, string> >::iterator c = cigar2.begin(); c != cigar2.end(); ++c) {
- cigar1.push_back(*c);
- }
- return joinCIGAR(cigar1);
-}
-
-vector<pair<int, string> > splitCIGAR(const string& cigarStr) {
- vector<pair<int, string> > cigar;
- string number;
- string type;
- // strings go [Number][Type] ...
- for (string::const_iterator s = cigarStr.begin(); s != cigarStr.end(); ++s) {
- char c = *s;
- if (isdigit(c)) {
- if (type.empty()) {
- number += c;
- } else {
- // signal for next token, push back the last pair, clean up
- cigar.push_back(make_pair(atoi(number.c_str()), type));
- number.clear();
- type.clear();
- number += c;
- }
- } else {
- type += c;
- }
- }
- if (!number.empty() && !type.empty()) {
- cigar.push_back(make_pair(atoi(number.c_str()), type));
- }
- return cigar;
-}
-
-string joinCIGAR(const vector<pair<int, string> >& cigar) {
- string cigarStr;
- for (vector<pair<int, string> >::const_iterator c = cigar.begin(); c != cigar.end(); ++c) {
- if (c->first) {
- cigarStr += convert(c->first) + c->second;
- }
- }
- return cigarStr;
-}
=====================================
external/vcflib/smithwaterman/LeftAlign.h deleted
=====================================
@@ -1,32 +0,0 @@
-#ifndef __LEFTALIGN_H
-#define __LEFTALIGN_H
-
-#include <algorithm>
-#include <map>
-#include <vector>
-#include <list>
-#include <utility>
-#include <sstream>
-
-#include "IndelAllele.h"
-#include "convert.h"
-
-#ifdef VERBOSE_DEBUG
-#define LEFTALIGN_DEBUG(msg) \
- if (debug) { cerr << msg; }
-#else
-#define LEFTALIGN_DEBUG(msg)
-#endif
-
-using namespace std;
-
-bool leftAlign(string& alternateQuery, string& cigar, string& referenceSequence, int& offset, bool debug = false);
-bool stablyLeftAlign(string alternateQuery, string& cigar, string referenceSequence, int& offset, int maxiterations = 20, bool debug = false);
-int countMismatches(string& alternateQuery, string& cigar, string& referenceSequence);
-
-string mergeCIGAR(const string& c1, const string& c2);
-vector<pair<int, string> > splitCIGAR(const string& cigarStr);
-string joinCIGAR(const vector<pair<int, string> >& cigar);
-
-
-#endif
=====================================
external/vcflib/smithwaterman/Makefile deleted
=====================================
@@ -1,34 +0,0 @@
-# =========================================
-# MOSAIK Banded Smith-Waterman Makefile
-# (c) 2009 Michael Stromberg & Wan-Ping Lee
-# =========================================
-
-# ----------------------------------
-# define our source and object files
-# ----------------------------------
-SOURCES= smithwaterman.cpp BandedSmithWaterman.cpp SmithWatermanGotoh.cpp Repeats.cpp LeftAlign.cpp IndelAllele.cpp
-OBJECTS= $(SOURCES:.cpp=.o)
-
-# ----------------
-# compiler options
-# ----------------
-
-CFLAGS=-Wall -O3
-LDFLAGS=-Wl,-s
-#CFLAGS=-g
-PROGRAM=smithwaterman
-LIBS=
-
-all: $(PROGRAM)
-
-.PHONY: all
-
-$(PROGRAM): $(OBJECTS)
- @echo " * linking $(PROGRAM)"
- @$(CXX) $(LDFLAGS) $(CFLAGS) -o $@ $^ $(LIBS)
-
-.PHONY: clean
-
-clean:
- @echo "Cleaning up."
- @rm -f *.o $(PROGRAM) *~
=====================================
external/vcflib/smithwaterman/Mosaik.h deleted
=====================================
@@ -1,73 +0,0 @@
-#pragma once
-
-#ifndef WIN32
-//#include "SafeFunctions.h"
-#endif
-
-// ==============
-// MOSAIK version
-// ==============
-
-#define MOSAIK_VERSION_DATE "2009-02-11"
-
-// adopt a major.minor.build version number [1].[1].[3]
-const unsigned char MOSAIK_MAJOR_VERSION = 0;
-const unsigned char MOSAIK_MINOR_VERSION = 9;
-const unsigned short MOSAIK_BUILD_VERSION = 899;
-
-// ================================
-// Platform specific variable sizes
-// ================================
-
-// Windows Vista 32-bit
-// Fedora Core 7 32-bit
-// Fedora Core 6 64-bit
-// Itanium2 64-bit
-#define SIZEOF_CHAR 1
-#define SIZEOF_WCHAR 2
-#define SIZEOF_SHORT 2
-#define SIZEOF_INT 4
-#define SIZEOF_FLOAT 4
-#define SIZEOF_DOUBLE 8
-#define SIZEOF_UINT64 8
-#define MOSAIK_LITTLE_ENDIAN 1
-
-#ifdef WIN32
-typedef signed long long int64_t;
-typedef unsigned long long uint64_t;
-#endif
-
-#define NEGATIVE_ONE_INT 0xffffffff
-#define NEGATIVE_TWO_INT 0xfffffffe
-#define NEGATIVE_THREE_INT 0xfffffffd
-#define NEGATIVE_FOUR_INT 0xfffffffc
-#define MAX_SHORT 0xffff
-
-// ==========================
-// Platform specific file I/O
-// ==========================
-
-#ifdef WIN32
-const char OS_DIRECTORY_SEPARATOR = '\\';
-#else
-const char OS_DIRECTORY_SEPARATOR = '/';
-#endif
-
-#define DIRECTORY_NAME_LENGTH 255
-
-// ====================================
-// Enable unit test diagnostic messages
-// ====================================
-
-#ifdef UNITTEST
-#define SILENTMODE if(0)
-#else
-#define SILENTMODE
-#endif
-
-// =================
-// Aligner constants
-// =================
-
-const double HASH_REGION_EXTENSION_PERCENT = 0.025;
-const unsigned char REFERENCE_SEQUENCE_QUALITY = 40;
=====================================
external/vcflib/smithwaterman/Repeats.cpp deleted
=====================================
@@ -1,69 +0,0 @@
-#include "Repeats.h"
-
-map<string, int> repeatCounts(long int position, const string& sequence, int maxsize) {
- map<string, int> counts;
- for (int i = 1; i <= maxsize; ++i) {
- // subseq here i bases
- string seq = sequence.substr(position, i);
- // go left.
-
- int j = position - i;
- int leftsteps = 0;
- while (j >= 0 && seq == sequence.substr(j, i)) {
- j -= i;
- ++leftsteps;
- }
-
- // go right.
- j = position;
-
- int rightsteps = 0;
- while (j + i <= (int)sequence.size() && seq == sequence.substr(j, i)) {
- j += i;
- ++rightsteps;
- }
- // if we went left and right a non-zero number of times,
- if (leftsteps + rightsteps > 1) {
- counts[seq] = leftsteps + rightsteps;
- }
- }
-
- // filter out redundant repeat information
- if (counts.size() > 1) {
- map<string, int> filteredcounts;
- map<string, int>::iterator c = counts.begin();
- string prev = c->first;
- filteredcounts[prev] = c->second; // shortest sequence
- ++c;
- for (; c != counts.end(); ++c) {
- int i = 0;
- string seq = c->first;
- while (i + prev.length() <= seq.length() && seq.substr(i, prev.length()) == prev) {
- i += prev.length();
- }
- if (i < (int)seq.length()) {
- filteredcounts[seq] = c->second;
- prev = seq;
- }
- }
- return filteredcounts;
- } else {
- return counts;
- }
-}
-
-bool isRepeatUnit(const string& seq, const string& unit) {
-
- if (seq.size() % unit.size() != 0) {
- return false;
- } else {
- int maxrepeats = seq.size() / unit.size();
- for (int i = 0; i < maxrepeats; ++i) {
- if (seq.substr(i * unit.size(), unit.size()) != unit) {
- return false;
- }
- }
- return true;
- }
-
-}
=====================================
external/vcflib/smithwaterman/Repeats.h deleted
=====================================
@@ -1,8 +0,0 @@
-#include <iostream>
-#include <string>
-#include <map>
-
-using namespace std;
-
-map<string, int> repeatCounts(long int pos, const string& seq, int maxsize);
-bool isRepeatUnit(const string& seq, const string& unit);
=====================================
external/vcflib/smithwaterman/SWMain.cpp deleted
=====================================
@@ -1,126 +0,0 @@
-#include <iostream>
-#include <string.h>
-//#include "Alignment.h"
-//#include "Benchmark.h"
-//#include "HashRegion.h"
-#include "SmithWatermanGotoh.h"
-#include "BandedSmithWaterman.h"
-
-using namespace std;
-
-int main(int argc, char* argv[]) {
-/*
- printf("------------------------------------------------------------------------------\n");
- printf("Banded Smith-Waterman Algorithm (worst case)\n");
- printf("Michael Stromberg & Wan-Ping Lee Marth Lab, Boston College Biology Department\n");
- printf("------------------------------------------------------------------------------\n\n");
-*/
- // this version simulates the worst case of only a fragment hashing to the
- // reference sequence. Basically a non-centered diagonal in the Smith-Waterman
- // dynamic programming matrix.
-
- // here we simulate a region on the reference that occurs between position 4001
- // and position 4136. During hashing, only the first 20 bases in the query
- // matched perfectly.
-
- // define the start and end coordinates of the entire reference region
- //const unsigned int start = 4001;
- //const unsigned int end = 4136;
-
- //const unsigned int testStart = atoi(argv[1]);
- //const unsigned int testEnd = atoi(argv[2]);
- //const unsigned int testQueryStart = atoi(argv[3]);
- //const unsigned int testQueryEnd = atoi(argv[4]);
-
- //cout << endl<< "=====================================================" << endl;
- //cout << testStart << "\t" << testQueryStart << endl;
-
- // define the 20 b:q
- // ases that matched perfectly
- //HashRegion hr;
-
- //=====================================================
- // defind the hash region
- // first.first: reference begin
- // first.second: reference end
- // second.first: query begin
- // second.second: query end
- //=====================================================
-
- pair< pair<unsigned int, unsigned int>, pair<unsigned int, unsigned int> > hr;
- hr.first.first = 5;
- hr.first.second = 13;
- hr.second.first = 0;
- hr.second.second = 8;
-
- //=====================================================
-
- // for 76 bp reads, we expect as much as 12 mismatches - however this does not
- // translate to a bandwidth of 12 * 2 + 1 since most of these will be
- // substitution errors
- const unsigned char bandwidth = 11;
-
- // initialize
- const char* pReference = "ATGGCGGGGATCGGGACACTCGCCGGTGCGGGTACCCTA";
- const char* pQuery = "GGGGATCGGGACACTCGCTCTCCGGTGCGGGTA";
-
- const unsigned int referenceLen = strlen(pReference);
- const unsigned int queryLen = strlen(pQuery);
-
- // ==============================================================================================
- // benchmarking reference on koi.bc.edu when NUM_ITERATIONS = 38000 on 76 bp read (1 try):
- // CPU time: 23.920 s, wall time: 24.012 s (1582.6 alignments/s)
- // ==============================================================================================
- //const unsigned int NUM_ITERATIONS = 38000;
- //unsigned int NUM_ITERATIONS = 1;
-
- // create a new Smith-Waterman alignment object
- CSmithWatermanGotoh sw(10.0f, -9.0f, 15.0f, 6.66f);
- CBandedSmithWaterman bsw(10.0f, -9.0f, 15.0f, 6.66f, bandwidth);
-
- // start timing the algorithm
- //CBenchmark bench;
- //bench.Start();
-
- // perform NUM_ITERATIONS alignments
- //Alignment bswAl;
- //Alignment swAl;
- // referenceBegin, referenceEnd
- unsigned int referenceSW, referenceBSW;
- string cigarSW, cigarBSW;
- //for(unsigned int i = 0; i < NUM_ITERATIONS; i++) {
- sw.Align(referenceSW, cigarSW, pReference, referenceLen, pQuery, queryLen);
- bsw.Align(referenceBSW, cigarBSW, pReference, referenceLen, pQuery, queryLen, hr);
- //}
-
- // stop timing the algorithm
- //bench.Stop();
-
- // calculate the alignments per second
- //double elapsedWallTime = bench.GetElapsedWallTime();
- //double alignmentsPerSecond = (double)NUM_ITERATIONS / elapsedWallTime;
-
- // show our results
- //printf("%d\t%d\n", al.ReferenceBegin,al.QueryBegin);
-
- printf("Smith-Waterman\n");
- printf("reference: %s %3u\n", cigarSW.c_str(), referenceSW);
- printf("Banded Smith-Waterman\n");
- printf("reference: %s %3u\n", cigarBSW.c_str(), referenceBSW);
- /*
- printf("Smith-Waterman\n");
- printf("reference: %s %3u %3u\n", swAl.Reference.CData(), swAl.ReferenceBegin, swAl.ReferenceEnd);
- printf("query: %s %3u %3u\n", swAl.Query.CData(), swAl.QueryBegin, swAl.QueryEnd);
- printf("mismatches: %u\n", swAl.NumMismatches);
- printf("\n");
- printf("Banded Smith-Waterman\n");
- printf("reference: %s %3u %3u\n", bswAl.Reference.CData(), bswAl.ReferenceBegin, bswAl.ReferenceEnd);
- printf("query: %s %3u %3u\n", bswAl.Query.CData(), bswAl.QueryBegin, bswAl.QueryEnd);
- printf("mismatches: %u\n", bswAl.NumMismatches);
- */
- //printf("alignments/s: %.1f\n\n", alignmentsPerSecond);
-
- //bench.DisplayTime("BandedSmithWaterman");
-
- return 0;
-}
=====================================
external/vcflib/smithwaterman/SmithWatermanGotoh.cpp deleted
=====================================
@@ -1,741 +0,0 @@
-#include "SmithWatermanGotoh.h"
-
-const float CSmithWatermanGotoh::FLOAT_NEGATIVE_INFINITY = (float)-1e+30;
-
-const char CSmithWatermanGotoh::Directions_STOP = 0;
-const char CSmithWatermanGotoh::Directions_LEFT = 1;
-const char CSmithWatermanGotoh::Directions_DIAGONAL = 2;
-const char CSmithWatermanGotoh::Directions_UP = 3;
-
-const int CSmithWatermanGotoh::repeat_size_max = 12;
-
-CSmithWatermanGotoh::CSmithWatermanGotoh(float matchScore, float mismatchScore, float gapOpenPenalty, float gapExtendPenalty)
- : mCurrentMatrixSize(0)
- , mCurrentAnchorSize(0)
- , mCurrentQuerySize(0)
- , mCurrentAQSumSize(0)
- , mMatchScore(matchScore)
- , mMismatchScore(mismatchScore)
- , mGapOpenPenalty(gapOpenPenalty)
- , mGapExtendPenalty(gapExtendPenalty)
- , mPointers(NULL)
- , mSizesOfVerticalGaps(NULL)
- , mSizesOfHorizontalGaps(NULL)
- , mQueryGapScores(NULL)
- , mBestScores(NULL)
- , mReversedAnchor(NULL)
- , mReversedQuery(NULL)
- , mUseHomoPolymerGapOpenPenalty(false)
- , mUseEntropyGapOpenPenalty(false)
- , mUseRepeatGapExtensionPenalty(false)
-{
- CreateScoringMatrix();
-}
-
-CSmithWatermanGotoh::~CSmithWatermanGotoh(void) {
- if(mPointers) delete [] mPointers;
- if(mSizesOfVerticalGaps) delete [] mSizesOfVerticalGaps;
- if(mSizesOfHorizontalGaps) delete [] mSizesOfHorizontalGaps;
- if(mQueryGapScores) delete [] mQueryGapScores;
- if(mBestScores) delete [] mBestScores;
- if(mReversedAnchor) delete [] mReversedAnchor;
- if(mReversedQuery) delete [] mReversedQuery;
-}
-
-// aligns the query sequence to the reference using the Smith Waterman Gotoh algorithm
-void CSmithWatermanGotoh::Align(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2) {
-
- if((s1.length() == 0) || (s2.length() == 0)) {
- cout << "ERROR: Found a read with a zero length." << endl;
- exit(1);
- }
-
- unsigned int referenceLen = s1.length() + 1;
- unsigned int queryLen = s2.length() + 1;
- unsigned int sequenceSumLength = s1.length() + s2.length();
-
- // reinitialize our matrices
-
- if((referenceLen * queryLen) > mCurrentMatrixSize) {
-
- // calculate the new matrix size
- mCurrentMatrixSize = referenceLen * queryLen;
-
- // delete the old arrays
- if(mPointers) delete [] mPointers;
- if(mSizesOfVerticalGaps) delete [] mSizesOfVerticalGaps;
- if(mSizesOfHorizontalGaps) delete [] mSizesOfHorizontalGaps;
-
- try {
-
- // initialize the arrays
- mPointers = new char[mCurrentMatrixSize];
- mSizesOfVerticalGaps = new short[mCurrentMatrixSize];
- mSizesOfHorizontalGaps = new short[mCurrentMatrixSize];
-
- } catch(bad_alloc) {
- cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
- exit(1);
- }
- }
-
- // initialize the traceback matrix to STOP
- memset((char*)mPointers, 0, SIZEOF_CHAR * queryLen);
- for(unsigned int i = 1; i < referenceLen; i++) mPointers[i * queryLen] = 0;
-
- // initialize the gap matrices to 1
- uninitialized_fill(mSizesOfVerticalGaps, mSizesOfVerticalGaps + mCurrentMatrixSize, 1);
- uninitialized_fill(mSizesOfHorizontalGaps, mSizesOfHorizontalGaps + mCurrentMatrixSize, 1);
-
-
- // initialize our repeat counts if they are needed
- vector<map<string, int> > referenceRepeats;
- vector<map<string, int> > queryRepeats;
- //int queryBeginRepeatBases = 0;
- //int queryEndRepeatBases = 0;
- if (mUseRepeatGapExtensionPenalty) {
- for (unsigned int i = 0; i < queryLen; ++i)
- queryRepeats.push_back(repeatCounts(i, s2, repeat_size_max));
- for (unsigned int i = 0; i < referenceLen; ++i)
- referenceRepeats.push_back(repeatCounts(i, s1, repeat_size_max));
-
- // keep only the biggest repeat
- vector<map<string, int> >::iterator q = queryRepeats.begin();
- for (; q != queryRepeats.end(); ++q) {
- map<string, int>::iterator biggest = q->begin();
- map<string, int>::iterator z = q->begin();
- for (; z != q->end(); ++z)
- if (z->first.size() > biggest->first.size()) biggest = z;
- z = q->begin();
- while (z != q->end()) {
- if (z != biggest)
- q->erase(z++);
- else ++z;
- }
- }
-
- q = referenceRepeats.begin();
- for (; q != referenceRepeats.end(); ++q) {
- map<string, int>::iterator biggest = q->begin();
- map<string, int>::iterator z = q->begin();
- for (; z != q->end(); ++z)
- if (z->first.size() > biggest->first.size()) biggest = z;
- z = q->begin();
- while (z != q->end()) {
- if (z != biggest)
- q->erase(z++);
- else ++z;
- }
- }
-
- // remove repeat information from ends of queries
- // this results in the addition of spurious flanking deletions in repeats
- map<string, int>& qrend = queryRepeats.at(queryRepeats.size() - 2);
- if (!qrend.empty()) {
- int queryEndRepeatBases = qrend.begin()->first.size() * qrend.begin()->second;
- for (int i = 0; i < queryEndRepeatBases; ++i)
- queryRepeats.at(queryRepeats.size() - 2 - i).clear();
- }
-
- map<string, int>& qrbegin = queryRepeats.front();
- if (!qrbegin.empty()) {
- int queryBeginRepeatBases = qrbegin.begin()->first.size() * qrbegin.begin()->second;
- for (int i = 0; i < queryBeginRepeatBases; ++i)
- queryRepeats.at(i).clear();
- }
-
- }
-
- //int entropyWindowSize = 8;
- vector<float> referenceEntropies;
- vector<float> queryEntropies;
- /*if (mUseEntropyGapOpenPenalty) {
- for (unsigned int i = 0; i < queryLen; ++i)
- queryEntropies.push_back(
- shannon_H((char*) &s2[max(0, min((int) i - entropyWindowSize / 2, (int) queryLen - entropyWindowSize - 1))],
- entropyWindowSize));
- for (unsigned int i = 0; i < referenceLen; ++i)
- referenceEntropies.push_back(
- shannon_H((char*) &s1[max(0, min((int) i - entropyWindowSize / 2, (int) referenceLen - entropyWindowSize - 1))],
- entropyWindowSize));
- }
- */
- // normalize entropies
- /*
- float qsum = 0;
- float qnorm = 0;
- float qmax = 0;
- for (vector<float>::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q) {
- qsum += *q;
- if (*q > qmax) qmax = *q;
- }
- qnorm = qsum / queryEntropies.size();
- for (vector<float>::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q)
- *q = *q / qsum + qmax;
-
- float rsum = 0;
- float rnorm = 0;
- float rmax = 0;
- for (vector<float>::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r) {
- rsum += *r;
- if (*r > rmax) rmax = *r;
- }
- rnorm = rsum / referenceEntropies.size();
- for (vector<float>::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r)
- *r = *r / rsum + rmax;
- */
-
- //
- // construct
- //
-
- // reinitialize our query-dependent arrays
- if(s2.length() > mCurrentQuerySize) {
-
- // calculate the new query array size
- mCurrentQuerySize = s2.length();
-
- // delete the old arrays
- if(mQueryGapScores) delete [] mQueryGapScores;
- if(mBestScores) delete [] mBestScores;
-
- // initialize the arrays
- try {
-
- mQueryGapScores = new float[mCurrentQuerySize + 1];
- mBestScores = new float[mCurrentQuerySize + 1];
-
- } catch(bad_alloc) {
- cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
- exit(1);
- }
- }
-
- // reinitialize our reference+query-dependent arrays
- if(sequenceSumLength > mCurrentAQSumSize) {
-
- // calculate the new reference array size
- mCurrentAQSumSize = sequenceSumLength;
-
- // delete the old arrays
- if(mReversedAnchor) delete [] mReversedAnchor;
- if(mReversedQuery) delete [] mReversedQuery;
-
- // initialize the arrays
- try {
-
- mReversedAnchor = new char[mCurrentAQSumSize + 1]; // reversed sequence #1
- mReversedQuery = new char[mCurrentAQSumSize + 1]; // reversed sequence #2
-
- } catch(bad_alloc) {
- cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
- exit(1);
- }
- }
-
- // initialize the gap score and score vectors
- uninitialized_fill(mQueryGapScores, mQueryGapScores + queryLen, FLOAT_NEGATIVE_INFINITY);
- memset((char*)mBestScores, 0, SIZEOF_FLOAT * queryLen);
-
- float similarityScore, totalSimilarityScore, bestScoreDiagonal;
- float queryGapExtendScore, queryGapOpenScore;
- float referenceGapExtendScore, referenceGapOpenScore, currentAnchorGapScore;
-
- unsigned int BestColumn = 0;
- unsigned int BestRow = 0;
- float BestScore = FLOAT_NEGATIVE_INFINITY;
-
- for(unsigned int i = 1, k = queryLen; i < referenceLen; i++, k += queryLen) {
-
- currentAnchorGapScore = FLOAT_NEGATIVE_INFINITY;
- bestScoreDiagonal = mBestScores[0];
-
- for(unsigned int j = 1, l = k + 1; j < queryLen; j++, l++) {
-
- // calculate our similarity score
- similarityScore = mScoringMatrix[s1[i - 1] - 'A'][s2[j - 1] - 'A'];
-
- // fill the matrices
- totalSimilarityScore = bestScoreDiagonal + similarityScore;
-
- //cerr << "i: " << i << ", j: " << j << ", totalSimilarityScore: " << totalSimilarityScore << endl;
-
- queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
- queryGapOpenScore = mBestScores[j] - mGapOpenPenalty;
-
- // compute the homo-polymer gap score if enabled
- if(mUseHomoPolymerGapOpenPenalty)
- if((j > 1) && (s2[j - 1] == s2[j - 2]))
- queryGapOpenScore = mBestScores[j] - mHomoPolymerGapOpenPenalty;
-
- // compute the entropy gap score if enabled
- if (mUseEntropyGapOpenPenalty) {
- queryGapOpenScore =
- mBestScores[j] - mGapOpenPenalty
- * max(queryEntropies.at(j), referenceEntropies.at(i))
- * mEntropyGapOpenPenalty;
- }
-
- int gaplen = mSizesOfVerticalGaps[l - queryLen] + 1;
-
- if (mUseRepeatGapExtensionPenalty) {
- map<string, int>& repeats = queryRepeats[j];
- // does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in?
- if (!repeats.empty()) {
-
- const pair<string, int>& repeat = *repeats.begin();
- int repeatsize = repeat.first.size();
- if (gaplen != repeatsize && gaplen % repeatsize != 0) {
- gaplen = gaplen / repeatsize + repeatsize;
- }
-
- if ((repeat.first.size() * repeat.second) > 3 && gaplen + i < s1.length()) {
- string gapseq = string(&s1[i], gaplen);
- if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) {
- queryGapExtendScore = mQueryGapScores[j]
- + mRepeatGapExtensionPenalty / (float) gaplen;
- // mMaxRepeatGapExtensionPenalty)
- } else {
- queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
- }
- }
- } else {
- queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
- }
- }
-
- if(queryGapExtendScore > queryGapOpenScore) {
- mQueryGapScores[j] = queryGapExtendScore;
- mSizesOfVerticalGaps[l] = gaplen;
- } else mQueryGapScores[j] = queryGapOpenScore;
-
- referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
- referenceGapOpenScore = mBestScores[j - 1] - mGapOpenPenalty;
-
- // compute the homo-polymer gap score if enabled
- if(mUseHomoPolymerGapOpenPenalty)
- if((i > 1) && (s1[i - 1] == s1[i - 2]))
- referenceGapOpenScore = mBestScores[j - 1] - mHomoPolymerGapOpenPenalty;
-
- // compute the entropy gap score if enabled
- if (mUseEntropyGapOpenPenalty) {
- referenceGapOpenScore =
- mBestScores[j - 1] - mGapOpenPenalty
- * max(queryEntropies.at(j), referenceEntropies.at(i))
- * mEntropyGapOpenPenalty;
- }
-
- gaplen = mSizesOfHorizontalGaps[l - 1] + 1;
-
- if (mUseRepeatGapExtensionPenalty) {
- map<string, int>& repeats = referenceRepeats[i];
- // does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in?
- if (!repeats.empty()) {
-
- const pair<string, int>& repeat = *repeats.begin();
- int repeatsize = repeat.first.size();
- if (gaplen != repeatsize && gaplen % repeatsize != 0) {
- gaplen = gaplen / repeatsize + repeatsize;
- }
-
- if ((repeat.first.size() * repeat.second) > 3 && gaplen + j < s2.length()) {
- string gapseq = string(&s2[j], gaplen);
- if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) {
- referenceGapExtendScore = currentAnchorGapScore
- + mRepeatGapExtensionPenalty / (float) gaplen;
- //mMaxRepeatGapExtensionPenalty)
- } else {
- referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
- }
- }
- } else {
- referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
- }
- }
-
- if(referenceGapExtendScore > referenceGapOpenScore) {
- currentAnchorGapScore = referenceGapExtendScore;
- mSizesOfHorizontalGaps[l] = gaplen;
- } else currentAnchorGapScore = referenceGapOpenScore;
-
- bestScoreDiagonal = mBestScores[j];
- mBestScores[j] = MaxFloats(totalSimilarityScore, mQueryGapScores[j], currentAnchorGapScore);
-
-
- // determine the traceback direction
- // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495)
- if(mBestScores[j] == 0) mPointers[l] = Directions_STOP;
- else if(mBestScores[j] == totalSimilarityScore) mPointers[l] = Directions_DIAGONAL;
- else if(mBestScores[j] == mQueryGapScores[j]) mPointers[l] = Directions_UP;
- else mPointers[l] = Directions_LEFT;
-
- // set the traceback start at the current cell i, j and score
- if(mBestScores[j] > BestScore) {
- BestRow = i;
- BestColumn = j;
- BestScore = mBestScores[j];
- }
- }
- }
-
- //
- // traceback
- //
-
- // aligned sequences
- int gappedAnchorLen = 0; // length of sequence #1 after alignment
- int gappedQueryLen = 0; // length of sequence #2 after alignment
- int numMismatches = 0; // the mismatched nucleotide count
-
- char c1, c2;
-
- int ci = BestRow;
- int cj = BestColumn;
- int ck = ci * queryLen;
-
- // traceback flag
- bool keepProcessing = true;
-
- while(keepProcessing) {
- //cerr << ci << " " << cj << " " << ck << " ... " << gappedAnchorLen << " " << gappedQueryLen << endl;
-
- // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495)
- switch(mPointers[ck + cj]) {
-
- case Directions_DIAGONAL:
- c1 = s1[--ci];
- c2 = s2[--cj];
- ck -= queryLen;
-
- mReversedAnchor[gappedAnchorLen++] = c1;
- mReversedQuery[gappedQueryLen++] = c2;
-
- // increment our mismatch counter
- if(mScoringMatrix[c1 - 'A'][c2 - 'A'] == mMismatchScore) numMismatches++;
- break;
-
- case Directions_STOP:
- keepProcessing = false;
- break;
-
- case Directions_UP:
- for(unsigned int l = 0, len = mSizesOfVerticalGaps[ck + cj]; l < len; l++) {
- if (ci <= 0) {
- keepProcessing = false;
- break;
- }
- mReversedAnchor[gappedAnchorLen++] = s1[--ci];
- mReversedQuery[gappedQueryLen++] = GAP;
- ck -= queryLen;
- numMismatches++;
- }
- break;
-
- case Directions_LEFT:
- for(unsigned int l = 0, len = mSizesOfHorizontalGaps[ck + cj]; l < len; l++) {
- if (cj <= 0) {
- keepProcessing = false;
- break;
- }
- mReversedAnchor[gappedAnchorLen++] = GAP;
- mReversedQuery[gappedQueryLen++] = s2[--cj];
- numMismatches++;
- }
- break;
- }
- }
-
- // define the reference and query sequences
- mReversedAnchor[gappedAnchorLen] = 0;
- mReversedQuery[gappedQueryLen] = 0;
-
- // catch sequences with different lengths
- if(gappedAnchorLen != gappedQueryLen) {
- cout << "ERROR: The aligned sequences have different lengths after Smith-Waterman-Gotoh algorithm." << endl;
- exit(1);
- }
-
- // reverse the strings and assign them to our alignment structure
- reverse(mReversedAnchor, mReversedAnchor + gappedAnchorLen);
- reverse(mReversedQuery, mReversedQuery + gappedQueryLen);
-
- //alignment.Reference = mReversedAnchor;
- //alignment.Query = mReversedQuery;
-
- // set the reference endpoints
- //alignment.ReferenceBegin = ci;
- //alignment.ReferenceEnd = BestRow - 1;
- referenceAl = ci;
-
- // set the query endpoints
- /*
- if(alignment.IsReverseComplement) {
- alignment.QueryBegin = s2Length - BestColumn;
- alignment.QueryEnd = s2Length - cj - 1;
- // alignment.QueryLength= alignment.QueryBegin - alignment.QueryEnd + 1;
- } else {
- alignment.QueryBegin = cj;
- alignment.QueryEnd = BestColumn - 1;
- // alignment.QueryLength= alignment.QueryEnd - alignment.QueryBegin + 1;
- }
- */
-
- // set the query length and number of mismatches
- //alignment.QueryLength = alignment.QueryEnd - alignment.QueryBegin + 1;
- //alignment.NumMismatches = numMismatches;
-
- unsigned int alLength = strlen(mReversedAnchor);
- unsigned int m = 0, d = 0, i = 0;
- bool dashRegion = false;
- ostringstream oCigar (ostringstream::out);
- int insertedBases = 0;
-
- if ( cj != 0 ) {
- if ( cj > 0 ) {
- oCigar << cj << 'S';
- } else { // how do we get negative cj's?
- referenceAl -= cj;
- alLength += cj;
- }
- }
-
- for ( unsigned int j = 0; j < alLength; j++ ) {
- // m
- if ( ( mReversedAnchor[j] != GAP ) && ( mReversedQuery[j] != GAP ) ) {
- if ( dashRegion ) {
- if ( d != 0 ) oCigar << d << 'D';
- else { oCigar << i << 'I'; insertedBases += i; }
- }
- dashRegion = false;
- m++;
- d = 0;
- i = 0;
- }
- else {
- if ( !dashRegion && m )
- oCigar << m << 'M';
- dashRegion = true;
- m = 0;
- if ( mReversedAnchor[j] == GAP ) {
- if ( d != 0 ) oCigar << d << 'D';
- i++;
- d = 0;
- }
- else {
- if ( i != 0) { oCigar << i << 'I'; insertedBases += i; }
- d++;
- i = 0;
- }
- }
- }
- if ( m != 0 ) oCigar << m << 'M';
- else if ( d != 0 ) oCigar << d << 'D';
- else if ( i != 0 ) oCigar << i << 'I';
-
- if ( BestColumn != s2.length() )
- oCigar << s2.length() - BestColumn << 'S';
-
- cigarAl = oCigar.str();
-
- // fix the gap order
- CorrectHomopolymerGapOrder(alLength, numMismatches);
-
- if (mUseEntropyGapOpenPenalty || mUseRepeatGapExtensionPenalty) {
- int offset = 0;
- string oldCigar;
- try {
- oldCigar = cigarAl;
- stablyLeftAlign(s2, cigarAl, s1.substr(referenceAl, alLength - insertedBases), offset);
- } catch (...) {
- cerr << "an exception occurred when left-aligning " << s1 << " " << s2 << endl;
- cigarAl = oldCigar; // undo the failed left-realignment attempt
- offset = 0;
- }
- referenceAl += offset;
- }
-
-}
-
-// creates a simple scoring matrix to align the nucleotides and the ambiguity code N
-void CSmithWatermanGotoh::CreateScoringMatrix(void) {
-
- unsigned int nIndex = 13;
- unsigned int xIndex = 23;
-
- // define the N score to be 1/4 of the span between mismatch and match
- //const short nScore = mMismatchScore + (short)(((mMatchScore - mMismatchScore) / 4.0) + 0.5);
-
- // calculate the scoring matrix
- for(unsigned char i = 0; i < MOSAIK_NUM_NUCLEOTIDES; i++) {
- for(unsigned char j = 0; j < MOSAIK_NUM_NUCLEOTIDES; j++) {
-
- // N.B. matching N to everything (while conceptually correct) leads to some
- // bad alignments, lets make N be a mismatch instead.
-
- // add the matches or mismatches to the hashtable (N is a mismatch)
- if((i == nIndex) || (j == nIndex)) mScoringMatrix[i][j] = mMismatchScore;
- else if((i == xIndex) || (j == xIndex)) mScoringMatrix[i][j] = mMismatchScore;
- else if(i == j) mScoringMatrix[i][j] = mMatchScore;
- else mScoringMatrix[i][j] = mMismatchScore;
- }
- }
-
- // add ambiguity codes
- mScoringMatrix['M' - 'A']['A' - 'A'] = mMatchScore; // M - A
- mScoringMatrix['A' - 'A']['M' - 'A'] = mMatchScore;
- mScoringMatrix['M' - 'A']['C' - 'A'] = mMatchScore; // M - C
- mScoringMatrix['C' - 'A']['M' - 'A'] = mMatchScore;
-
- mScoringMatrix['R' - 'A']['A' - 'A'] = mMatchScore; // R - A
- mScoringMatrix['A' - 'A']['R' - 'A'] = mMatchScore;
- mScoringMatrix['R' - 'A']['G' - 'A'] = mMatchScore; // R - G
- mScoringMatrix['G' - 'A']['R' - 'A'] = mMatchScore;
-
- mScoringMatrix['W' - 'A']['A' - 'A'] = mMatchScore; // W - A
- mScoringMatrix['A' - 'A']['W' - 'A'] = mMatchScore;
- mScoringMatrix['W' - 'A']['T' - 'A'] = mMatchScore; // W - T
- mScoringMatrix['T' - 'A']['W' - 'A'] = mMatchScore;
-
- mScoringMatrix['S' - 'A']['C' - 'A'] = mMatchScore; // S - C
- mScoringMatrix['C' - 'A']['S' - 'A'] = mMatchScore;
- mScoringMatrix['S' - 'A']['G' - 'A'] = mMatchScore; // S - G
- mScoringMatrix['G' - 'A']['S' - 'A'] = mMatchScore;
-
- mScoringMatrix['Y' - 'A']['C' - 'A'] = mMatchScore; // Y - C
- mScoringMatrix['C' - 'A']['Y' - 'A'] = mMatchScore;
- mScoringMatrix['Y' - 'A']['T' - 'A'] = mMatchScore; // Y - T
- mScoringMatrix['T' - 'A']['Y' - 'A'] = mMatchScore;
-
- mScoringMatrix['K' - 'A']['G' - 'A'] = mMatchScore; // K - G
- mScoringMatrix['G' - 'A']['K' - 'A'] = mMatchScore;
- mScoringMatrix['K' - 'A']['T' - 'A'] = mMatchScore; // K - T
- mScoringMatrix['T' - 'A']['K' - 'A'] = mMatchScore;
-
- mScoringMatrix['V' - 'A']['A' - 'A'] = mMatchScore; // V - A
- mScoringMatrix['A' - 'A']['V' - 'A'] = mMatchScore;
- mScoringMatrix['V' - 'A']['C' - 'A'] = mMatchScore; // V - C
- mScoringMatrix['C' - 'A']['V' - 'A'] = mMatchScore;
- mScoringMatrix['V' - 'A']['G' - 'A'] = mMatchScore; // V - G
- mScoringMatrix['G' - 'A']['V' - 'A'] = mMatchScore;
-
- mScoringMatrix['H' - 'A']['A' - 'A'] = mMatchScore; // H - A
- mScoringMatrix['A' - 'A']['H' - 'A'] = mMatchScore;
- mScoringMatrix['H' - 'A']['C' - 'A'] = mMatchScore; // H - C
- mScoringMatrix['C' - 'A']['H' - 'A'] = mMatchScore;
- mScoringMatrix['H' - 'A']['T' - 'A'] = mMatchScore; // H - T
- mScoringMatrix['T' - 'A']['H' - 'A'] = mMatchScore;
-
- mScoringMatrix['D' - 'A']['A' - 'A'] = mMatchScore; // D - A
- mScoringMatrix['A' - 'A']['D' - 'A'] = mMatchScore;
- mScoringMatrix['D' - 'A']['G' - 'A'] = mMatchScore; // D - G
- mScoringMatrix['G' - 'A']['D' - 'A'] = mMatchScore;
- mScoringMatrix['D' - 'A']['T' - 'A'] = mMatchScore; // D - T
- mScoringMatrix['T' - 'A']['D' - 'A'] = mMatchScore;
-
- mScoringMatrix['B' - 'A']['C' - 'A'] = mMatchScore; // B - C
- mScoringMatrix['C' - 'A']['B' - 'A'] = mMatchScore;
- mScoringMatrix['B' - 'A']['G' - 'A'] = mMatchScore; // B - G
- mScoringMatrix['G' - 'A']['B' - 'A'] = mMatchScore;
- mScoringMatrix['B' - 'A']['T' - 'A'] = mMatchScore; // B - T
- mScoringMatrix['T' - 'A']['B' - 'A'] = mMatchScore;
-}
-
-// enables homo-polymer scoring
-void CSmithWatermanGotoh::EnableHomoPolymerGapPenalty(float hpGapOpenPenalty) {
- mUseHomoPolymerGapOpenPenalty = true;
- mHomoPolymerGapOpenPenalty = hpGapOpenPenalty;
-}
-
-// enables entropy-based gap open penalty
-void CSmithWatermanGotoh::EnableEntropyGapPenalty(float enGapOpenPenalty) {
- mUseEntropyGapOpenPenalty = true;
- mEntropyGapOpenPenalty = enGapOpenPenalty;
-}
-
-// enables repeat-aware gap extension penalty
-void CSmithWatermanGotoh::EnableRepeatGapExtensionPenalty(float rGapExtensionPenalty, float rMaxGapRepeatExtensionPenaltyFactor) {
- mUseRepeatGapExtensionPenalty = true;
- mRepeatGapExtensionPenalty = rGapExtensionPenalty;
- mMaxRepeatGapExtensionPenalty = rMaxGapRepeatExtensionPenaltyFactor * rGapExtensionPenalty;
-}
-
-// corrects the homopolymer gap order for forward alignments
-void CSmithWatermanGotoh::CorrectHomopolymerGapOrder(const unsigned int numBases, const unsigned int numMismatches) {
-
- // this is only required for alignments with mismatches
- //if(al.NumMismatches == 0) return;
- if ( numMismatches == 0 ) return;
-
- // localize the alignment data
- //char* pReference = al.Reference.Data();
- //char* pQuery = al.Query.Data();
- //const unsigned int numBases = al.Reference.Length();
- char* pReference = mReversedAnchor;
- char* pQuery = mReversedQuery;
-
- // initialize
- bool hasReferenceGap = false, hasQueryGap = false;
- char* pNonGapSeq = NULL;
- char* pGapSeq = NULL;
- char nonGapBase = 'J';
-
- // identify gapped regions
- for(unsigned int i = 0; i < numBases; i++) {
-
- // check if the current position is gapped
- hasReferenceGap = false;
- hasQueryGap = false;
-
- if(pReference[i] == GAP) {
- hasReferenceGap = true;
- pNonGapSeq = pQuery;
- pGapSeq = pReference;
- nonGapBase = pQuery[i];
- }
-
- if(pQuery[i] == GAP) {
- hasQueryGap = true;
- pNonGapSeq = pReference;
- pGapSeq = pQuery;
- nonGapBase = pReference[i];
- }
-
- // continue if we don't have any gaps
- if(!hasReferenceGap && !hasQueryGap) continue;
-
- // sanity check
- if(hasReferenceGap && hasQueryGap) {
- printf("ERROR: Found a gap in both the reference sequence and query sequence.\n");
- exit(1);
- }
-
- // find the non-gapped length (forward)
- unsigned short numGappedBases = 0;
- unsigned short nonGapLength = 0;
- unsigned short testPos = i;
- while(testPos < numBases) {
-
- const char gs = pGapSeq[testPos];
- const char ngs = pNonGapSeq[testPos];
-
- bool isPartofHomopolymer = false;
- if(((gs == nonGapBase) || (gs == GAP)) && (ngs == nonGapBase)) isPartofHomopolymer = true;
- if(!isPartofHomopolymer) break;
-
- if(gs == GAP) numGappedBases++;
- else nonGapLength++;
- testPos++;
- }
-
- // fix the gap order
- if(numGappedBases != 0) {
- char* pCurrentSequence = pGapSeq + i;
- memset(pCurrentSequence, nonGapBase, nonGapLength);
- pCurrentSequence += nonGapLength;
- memset(pCurrentSequence, GAP, numGappedBases);
- }
-
- // increment
- i += numGappedBases + nonGapLength - 1;
- }
-}
=====================================
external/vcflib/smithwaterman/SmithWatermanGotoh.h deleted
=====================================
@@ -1,100 +0,0 @@
-#pragma once
-
-#include <iostream>
-#include <algorithm>
-#include <memory>
-//#include "Alignment.h"
-#include "Mosaik.h"
-#include <stdio.h>
-#include <string.h>
-#include <sstream>
-#include <string>
-#include "Repeats.h"
-#include "LeftAlign.h"
-
-using namespace std;
-
-#define MOSAIK_NUM_NUCLEOTIDES 26
-#define GAP '-'
-
-class CSmithWatermanGotoh {
-public:
- // constructor
- CSmithWatermanGotoh(float matchScore, float mismatchScore, float gapOpenPenalty, float gapExtendPenalty);
- // destructor
- ~CSmithWatermanGotoh(void);
- // aligns the query sequence to the reference using the Smith Waterman Gotoh algorithm
- void Align(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2);
- // enables homo-polymer scoring
- void EnableHomoPolymerGapPenalty(float hpGapOpenPenalty);
- // enables non-repeat gap open penalty
- void EnableEntropyGapPenalty(float enGapOpenPenalty);
- // enables repeat gap extension penalty
- void EnableRepeatGapExtensionPenalty(float rGapExtensionPenalty, float rMaxGapRepeatExtensionPenaltyFactor = 10);
-private:
- // creates a simple scoring matrix to align the nucleotides and the ambiguity code N
- void CreateScoringMatrix(void);
- // corrects the homopolymer gap order for forward alignments
- void CorrectHomopolymerGapOrder(const unsigned int numBases, const unsigned int numMismatches);
- // returns the maximum floating point number
- static inline float MaxFloats(const float& a, const float& b, const float& c);
- // our simple scoring matrix
- float mScoringMatrix[MOSAIK_NUM_NUCLEOTIDES][MOSAIK_NUM_NUCLEOTIDES];
- // keep track of maximum initialized sizes
- unsigned int mCurrentMatrixSize;
- unsigned int mCurrentAnchorSize;
- unsigned int mCurrentQuerySize;
- unsigned int mCurrentAQSumSize;
- // define our traceback directions
- // N.B. This used to be defined as an enum, but gcc doesn't like being told
- // which storage class to use
- const static char Directions_STOP;
- const static char Directions_LEFT;
- const static char Directions_DIAGONAL;
- const static char Directions_UP;
- // repeat structure determination
- const static int repeat_size_max;
- // define scoring constants
- const float mMatchScore;
- const float mMismatchScore;
- const float mGapOpenPenalty;
- const float mGapExtendPenalty;
- // store the backtrace pointers
- char* mPointers;
- // store the vertical gap sizes - assuming gaps are not longer than 32768 bases long
- short* mSizesOfVerticalGaps;
- // store the horizontal gap sizes - assuming gaps are not longer than 32768 bases long
- short* mSizesOfHorizontalGaps;
- // score if xi aligns to a gap after yi
- float* mQueryGapScores;
- // best score of alignment x1...xi to y1...yi
- float* mBestScores;
- // our reversed alignment
- char* mReversedAnchor;
- char* mReversedQuery;
- // define static constants
- static const float FLOAT_NEGATIVE_INFINITY;
- // toggles the use of the homo-polymer gap open penalty
- bool mUseHomoPolymerGapOpenPenalty;
- // specifies the homo-polymer gap open penalty
- float mHomoPolymerGapOpenPenalty;
- // toggles the use of the entropy gap open penalty
- bool mUseEntropyGapOpenPenalty;
- // specifies the entropy gap open penalty (multiplier)
- float mEntropyGapOpenPenalty;
- // toggles the use of the repeat gap extension penalty
- bool mUseRepeatGapExtensionPenalty;
- // specifies the repeat gap extension penalty
- float mRepeatGapExtensionPenalty;
- // specifies the max repeat gap extension penalty
- float mMaxRepeatGapExtensionPenalty;
-};
-
-// returns the maximum floating point number
-inline float CSmithWatermanGotoh::MaxFloats(const float& a, const float& b, const float& c) {
- float max = 0.0f;
- if(a > max) max = a;
- if(b > max) max = b;
- if(c > max) max = c;
- return max;
-}
=====================================
external/vcflib/smithwaterman/convert.h deleted
=====================================
@@ -1,22 +0,0 @@
-#ifndef __CONVERT_H
-#define __CONVERT_H
-
-#include <sstream>
-
-// converts the string into the specified type, setting r to the converted
-// value and returning true/false on success or failure
-template<typename T>
-bool convert(const std::string& s, T& r) {
- std::istringstream iss(s);
- iss >> r;
- return iss.eof() ? true : false;
-}
-
-template<typename T>
-std::string convert(const T& r) {
- std::ostringstream iss;
- iss << r;
- return iss.str();
-}
-
-#endif
=====================================
external/vcflib/smithwaterman/smithwaterman.cpp deleted
=====================================
@@ -1,246 +0,0 @@
-#include <iostream>
-#include <string.h>
-#include <string>
-#include <sstream>
-#include <getopt.h>
-#include <utility>
-#include <vector>
-#include <stdlib.h>
-#include "SmithWatermanGotoh.h"
-#include "BandedSmithWaterman.h"
-
-using namespace std;
-
-void printSummary(void) {
- cerr << "usage: smithwaterman [options] <reference sequence> <query sequence>" << endl
- << endl
- << "options:" << endl
- << " -m, --match-score the match score (default 10.0)" << endl
- << " -n, --mismatch-score the mismatch score (default -9.0)" << endl
- << " -g, --gap-open-penalty the gap open penalty (default 15.0)" << endl
- << " -z, --entropy-gap-open-penalty enable entropy scaling of the gap open penalty" << endl
- << " -e, --gap-extend-penalty the gap extend penalty (default 6.66)" << endl
- << " -r, --repeat-gap-extend-penalty use repeat information when generating gap extension penalties" << endl
- << " -b, --bandwidth bandwidth to use (default 0, or non-banded algorithm)" << endl
- << " -p, --print-alignment print out the alignment" << endl
- << endl
- << "When called with literal reference and query sequences, smithwaterman" << endl
- << "prints the cigar match positional string and the match position for the" << endl
- << "query sequence against the reference sequence." << endl;
-}
-
-
-int main (int argc, char** argv) {
-
- int c;
-
- string reference;
- string query;
-
- int bandwidth = 0;
-
- float matchScore = 10.0f;
- float mismatchScore = -9.0f;
- float gapOpenPenalty = 15.0f;
- float gapExtendPenalty = 6.66f;
- float entropyGapOpenPenalty = 0.0f;
- bool useRepeatGapExtendPenalty = false;
- float repeatGapExtendPenalty = 1.0f;
-
- bool print_alignment = false;
-
- while (true) {
- static struct option long_options[] =
- {
- {"help", no_argument, 0, 'h'},
- {"match-score", required_argument, 0, 'm'},
- {"mismatch-score", required_argument, 0, 'n'},
- {"gap-open-penalty", required_argument, 0, 'g'},
- {"entropy-gap-open-penalty", required_argument, 0, 'z'},
- {"gap-extend-penalty", required_argument, 0, 'e'},
- {"repeat-gap-extend-penalty", required_argument, 0, 'r'},
- {"print-alignment", required_argument, 0, 'p'},
- {"bandwidth", required_argument, 0, 'b'},
- {0, 0, 0, 0}
- };
- int option_index = 0;
-
- c = getopt_long (argc, argv, "hpzm:n:g:r:e:b:r:",
- long_options, &option_index);
-
- if (c == -1)
- break;
-
- switch (c)
- {
- case 0:
- /* If this option set a flag, do nothing else now. */
- if (long_options[option_index].flag != 0)
- break;
- printf ("option %s", long_options[option_index].name);
- if (optarg)
- printf (" with arg %s", optarg);
- printf ("\n");
- break;
-
- case 'm':
- matchScore = atof(optarg);
- break;
-
- case 'n':
- mismatchScore = atof(optarg);
- break;
-
- case 'g':
- gapOpenPenalty = atof(optarg);
- break;
-
- case 'z':
- entropyGapOpenPenalty = 1;
- break;
-
- case 'r':
- useRepeatGapExtendPenalty = true;
- repeatGapExtendPenalty = atof(optarg);
- break;
-
- case 'e':
- gapExtendPenalty = atof(optarg);
- break;
-
- case 'b':
- bandwidth = atoi(optarg);
- break;
-
- case 'p':
- print_alignment = true;
- break;
-
- case 'h':
- printSummary();
- exit(0);
- break;
-
- case '?':
- /* getopt_long already printed an error message. */
- printSummary();
- exit(1);
- break;
-
- default:
- abort ();
- }
- }
-
- /* Print any remaining command line arguments (not options). */
- if (optind == argc - 2) {
- //cerr << "fasta file: " << argv[optind] << endl;
- reference = string(argv[optind]);
- ++optind;
- query = string(argv[optind]);
- } else {
- cerr << "please specify a reference and query sequence" << endl
- << "execute " << argv[0] << " --help for command-line usage" << endl;
- exit(1);
- }
-
- // initialize
-
- unsigned int referencePos;
- string cigar;
-
- // create a new Smith-Waterman alignment object
- if (bandwidth > 0) {
- pair< pair<unsigned int, unsigned int>, pair<unsigned int, unsigned int> > hr;
- hr.first.first = 2;
- hr.first.second = 18;
- hr.second.first = 1;
- hr.second.second = 17;
- CBandedSmithWaterman bsw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty, bandwidth);
- bsw.Align(referencePos, cigar, reference, query, hr);
- } else {
- CSmithWatermanGotoh sw(matchScore, mismatchScore, gapOpenPenalty, gapExtendPenalty);
- if (useRepeatGapExtendPenalty)
- sw.EnableRepeatGapExtensionPenalty(repeatGapExtendPenalty);
- if (entropyGapOpenPenalty > 0)
- sw.EnableEntropyGapPenalty(entropyGapOpenPenalty);
- sw.Align(referencePos, cigar, reference, query);
- }
-
- printf("%s %3u\n", cigar.c_str(), referencePos);
-
- // optionally print out the alignment
- if (print_alignment) {
- int alignmentLength = 0;
- int len;
- string slen;
- vector<pair<int, char> > cigarData;
- for (string::iterator c = cigar.begin(); c != cigar.end(); ++c) {
- switch (*c) {
- case 'I':
- len = atoi(slen.c_str());
- slen.clear();
- cigarData.push_back(make_pair(len, *c));
- break;
- case 'D':
- len = atoi(slen.c_str());
- alignmentLength += len;
- slen.clear();
- cigarData.push_back(make_pair(len, *c));
- break;
- case 'M':
- len = atoi(slen.c_str());
- alignmentLength += len;
- slen.clear();
- cigarData.push_back(make_pair(len, *c));
- break;
- case 'S':
- len = atoi(slen.c_str());
- slen.clear();
- cigarData.push_back(make_pair(len, *c));
- break;
- default:
- len = 0;
- slen += *c;
- break;
- }
- }
-
- string gapped_ref = string(reference).substr(referencePos, alignmentLength);
- string gapped_query = string(query);
-
- int refpos = 0;
- int readpos = 0;
- for (vector<pair<int, char> >::iterator c = cigarData.begin(); c != cigarData.end(); ++c) {
- int len = c->first;
- switch (c->second) {
- case 'I':
- gapped_ref.insert(refpos, string(len, '-'));
- readpos += len;
- refpos += len;
- break;
- case 'D':
- gapped_query.insert(readpos, string(len, '-'));
- refpos += len;
- readpos += len;
- break;
- case 'M':
- readpos += len;
- refpos += len;
- break;
- case 'S':
- readpos += len;
- gapped_ref.insert(refpos, string(len, '*'));
- refpos += len;
- break;
- default:
- break;
- }
- }
-
- cout << gapped_ref << endl << gapped_query << endl;
- }
-
- return 0;
-
-}
View it on GitLab: https://salsa.debian.org/med-team/tvc/compare/afcf9f4181f26045d9cabe41d6a48b148641cf9f...f5165aac845c0cc9f6d386b02d06ff3a3cb49119
--
View it on GitLab: https://salsa.debian.org/med-team/tvc/compare/afcf9f4181f26045d9cabe41d6a48b148641cf9f...f5165aac845c0cc9f6d386b02d06ff3a3cb49119
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20180927/b5ac569b/attachment-0001.html>
More information about the debian-med-commit
mailing list