[med-svn] [varmatch] 01/02: Imported Upstream version 0+20160708+dfsg
Afif Elghraoui
afif at moszumanska.debian.org
Tue Jul 12 08:22:03 UTC 2016
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository varmatch.
commit 81343a60519e37e9568e7522ead85a890c5f01bd
Author: Afif Elghraoui <afif at ghraoui.name>
Date: Tue Jul 12 00:46:29 2016 -0700
Imported Upstream version 0+20160708+dfsg
---
.gitmodules | 3 +
README.md | 132 ++
drawsth.py | 9 +
examples/chromosome_list.txt | 5 +
filter | 191 ++
lib/__init__.py | 0
lib/binary_search_tree.py | 445 +++++
lib/binary_tree.py | 74 +
lib/linked_binary_tree.py | 196 ++
lib/linked_queue.py | 77 +
lib/map_base.py | 38 +
lib/red_black_tree.py | 112 ++
lib/tree.py | 151 ++
license.txt | 674 +++++++
makefile | 12 +
purify | 71 +
py/lib/__init__.py | 0
py/lib/binary_search_tree.py | 445 +++++
py/lib/binary_tree.py | 74 +
py/lib/linked_binary_tree.py | 196 ++
py/lib/linked_queue.py | 77 +
py/lib/map_base.py | 38 +
py/lib/red_black_tree.py | 112 ++
py/lib/tree.py | 151 ++
py/vcfcompare.py | 1098 +++++++++++
py/vcfcompare_backup.py | 677 +++++++
script/add_marker.py | 0
script/compare_match.py | 44 +
script/count_decomposed_matching.py | 28 +
script/direct_match.py | 32 +
script/filter_hc.py | 120 ++
script/filter_lcr.py | 120 ++
script/overlap.py | 147 ++
script/overlap_direct.py | 138 ++
script/varmatch | 484 +++++
src/diploid.cpp | 3562 +++++++++++++++++++++++++++++++++++
src/diploid.h | 342 ++++
src/diploidvariant.h | 117 ++
src/filter_cv.cpp | 245 +++
src/filter_hc.cpp | 158 ++
src/makefile | 20 +
src/removeduplicate.cpp | 456 +++++
src/removeduplicate.h | 31 +
src/splitvcf.cpp | 30 +
src/splitvcf.h | 15 +
src/test.py | 1 +
src/threadguard.cpp | 9 +
src/threadguard.h | 17 +
src/util.cpp | 20 +
src/util.h | 54 +
src/vcf.cpp | 1230 ++++++++++++
src/vcf.h | 210 +++
src/vm.cpp | 233 +++
src/wholegenome.cpp | 3341 ++++++++++++++++++++++++++++++++
src/wholegenome.h | 367 ++++
src/wholegenome_backup.cpp | 2056 ++++++++++++++++++++
src/wholegenome_backup.h | 274 +++
src/wholegenome_working.cpp | 2471 ++++++++++++++++++++++++
src/wholegenome_working.h | 292 +++
stat | 19 +
vardiff | 299 +++
varmatch | 587 ++++++
xx.png | Bin 0 -> 27349 bytes
63 files changed, 22627 insertions(+)
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..37dabe4
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "vt"]
+ path = vt
+ url = https://github.com/atks/vt.git
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8eebf9a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,132 @@
+# VarMatch
+robust matching of small variant datasets using flexible scoring schemes
+
+# Authors
+- Chen Sun (The Pennsylvania State University)
+- Paul Medvedev (The Pennsylvania State University)
+
+# Release Date
+### TBA
+If you have any questions about VarMatch, please email chensun at cse dot psu dot edu.
+
+If you identify a bug in VarMatch, please either report it on the VarMatch GitHub Issues page or email chensun at cse dot psu dot edu directly.
+
+
+
+# Prerequisite
+- GCC 4.7 or later for c++11 support
+- Python 2.7 or later
+- matplotlib*
+
+> *matplotlib is only used for graphical visualization. You can use the `-G` parameter to disable the visualization function.
+
+> *matplotlib is not a prerequisite if either `-f`, `-G` or `-C` parameter is used
+
+# Installation
+**Quick Install Instruction:**
+You can build VarMatch from source.
+```
+git clone https://github.com/medvedevgroup/varmatch.git
+cd varmatch
+make all
+```
+
+# Usage
+### Quick Usage:
+
+*compare two vcf files to match variants*
+
+```
+./varmatch -b baseline.vcf -q query.vcf -g ref.fa -o out -f
+```
+- `-b` baseline vcf file
+- `-q` query vcf file
+- `-g` genome fasta file
+- `-o` output file prefix, default value is `out`
+- `-f` fast mode*, equivalent to using the parameters `-u 0 -m 0 -s 0 -C`
+
+>*fast mode is suggested for ordinary analysis
+
+### Detailed Usage
+
+```
+./varmatch -g <file> -b <file> -q <file> [-o <string>] [-t <int>] [-u <0|1>]
+ [-m <0|1>] [-s <0|1|2|3>] [-h] [-G] [-C] [-f]
+```
+
+Where:
+
+ `-g` <file>, `--genome_sequence` <file>
+ (required) genome sequence FASTA filename
+
+ `-b` <file>, `--baseline` <file>
+ (required) baseline variant VCF filename
+
+ `-q` <file>, `--query` <file>
+ (required) query variant VCF filename
+
+ `-o` <string>, `--output_prefix` <string>
+ output filename prefix, default is "out"
+
+ `-t` <int>, `--thread_num` <int>
+ number of threads, default is the number of available cores.
+
+ If larger than number of available cores or less than 1, automatically
+ set to default value
+
+ `-u` <0|1>, `--score_unit` <0|1>
+ scoring function/score unit: (Default: 0)
+
+ 0 : the score that a VCF entry contributes is 1.
+
+ 1 : the score that a VCF entry contributes is the edit distance
+ between the new allele and the reference one.
+
+
+ `-m` <0|1>, `--match_mode` <0|1>
+ matching mode: (Default: 0)
+
+ 0 : a set of query entries matches a set of baseline entries if, for
+ each entry, we can select one of the alleles such that the inferred
+ sequences are identical
+
+ 1 : a set of query entries matches a set of baseline entries if there
+ exists a phasing of each set such that the two inferred haplotypes from
+ the query are equal to the two inferred haplotypes from the
+ baseline.
+
+
+ `-s` <0|1|2|3>, `--score_scheme` <0|1|2|3>
+ scoring scheme: (Default: 0)
+
+ 0 : find two subsets of non-overlapping equivalent variants such that
+ the score of the matched variants is maximized (Default)
+
+ 1 : find two subsets of non-overlapping equivalent variants such that
+ the score of the chosen baseline variants is maximized
+
+ 2 : find a maximum scoring set of variants in the query such that each
+ variant can be matched by a subset of the baseline variants
+
+ 3 : (1 to 1 direct match) find a maximum scoring set of entry pairs
+ such that each entry pair contains one query and one baseline variant
+ that result in the same sequence. In this scheme, the choice of scoring
+ function and matching mode makes no difference.
+
+
+ `-G`, `--no_graph`
+ disable graphic module
+ `-C`, `--disable_curves`
+ disable Precision-Recall curves; these curves are also disabled
+ automatically when -G or --no_graph is used
+ `-f`, `--fast_mode`
+ In this mode, the graphic module and precision-recall curves are
+ disabled automatically, and only one matching criterion is evaluated.
+ Fast mode is equivalent to forcing the following parameters: -G
+ -u 0 -m 0 -s 0
+
+
+### Help Information:
+
+Use `-h`/`--help` for a detailed help message.
+
diff --git a/drawsth.py b/drawsth.py
new file mode 100644
index 0000000..c4c2298
--- /dev/null
+++ b/drawsth.py
@@ -0,0 +1,9 @@
+import numpy as np
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+
+t = np.arange(0., 5., 0.2)
+plt.plot(t,t,'r-o')
+plt.plot(t,t/2, 'g-^')
+plt.savefig('xx.png')
diff --git a/examples/chromosome_list.txt b/examples/chromosome_list.txt
new file mode 100644
index 0000000..6484ba2
--- /dev/null
+++ b/examples/chromosome_list.txt
@@ -0,0 +1,5 @@
+1 /home/varmatch/human/chr1.fa
+2 /home/varmatch/human/chr2.fa
+17 /home/varmatch/human/backup/chr17.fa
+X /home/varmatch/human/chrxx.fa
+Y /home/anotherpath/human/chrY/human.y.fa
\ No newline at end of file
diff --git a/filter b/filter
new file mode 100755
index 0000000..15f5a3f
--- /dev/null
+++ b/filter
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+from sys import argv
+import argparse
+import math
+import scipy.stats as stats
+
+citation = 'Please cite our paper'
+
+parser = argparse.ArgumentParser(epilog=citation)
+parser.add_argument('--qu', metavar='N', help='quality number(QUAL) threshold >= N (default: N=30)', default=30)
+parser.add_argument('--ab', metavar='N', help='allele balance(AB) threshold <= N%% (default: N=20)', default=20)
+parser.add_argument('--fs', metavar='N', help='Fisher strand P-vale <= N (default: N=0.001)', default=0.001)
+parser.add_argument('--rd', metavar='N', default=65,
+ help="average read depth=N, maximum read depth(MD) threshold >= N+4*sqrt(N) (default: N=65),"
+ " use --rd 0 to disable MD filter")
+parser.add_argument('-i', metavar='input.vcf', help='input VCF file')
+parser.add_argument('-o', metavar='output.vcf', help='output VCF file name(default: output.vcf)', default='output.vcf')
+parser.add_argument('--homo', action='store_true', help='filter out homozygous variants')
+parser.add_argument('--nf', action='store_true', help="no filters used in Heng Li review")
+parser.add_argument('--snp', action='store_true', help="only want SNPs")
+parser.add_argument('--indel', action='store_true', help='only want INDELs')
+args = parser.parse_args()
+
+
+def main():
+ if len(argv) < 2:
+ parser.print_help()
+ exit()
+
+ filter_homo = args.homo
+
+ if not filter_homo:
+ print ('Warning: compulsively filter out homozygous variants :)')
+ filter_homo = True
+
+ md = 0 # maximum depth filter
+ if args.rd != 0:
+ md = args.rd + 4 * math.sqrt(args.rd)
+ else:
+ print ('Warning: maximum depth(MD) filter is disabled because read depth = 0')
+
+ output_file = open(args.o, 'w')
+
+ with open(args.i) as input_file:
+ for line in input_file.readlines():
+ qu_fail = False
+ ab_fail = False
+ fs_fail = False
+ md_fail = False
+ if line.startswith('#'):
+ output_file.write(line)
+ continue
+ columns = line.split('\t')
+ if len(columns) < 8:
+ print ('Warning: current variant does not contains enough info for filtering')
+ continue
+
+ ab_contain = False
+ ab_pass = True
+ two_alleles = False
+ pv = 1.0
+ rd = -1
+ srf = -1
+ srr = -1
+ saf_list = []
+ sar_list = []
+ alt = columns[4]
+ if ',' in alt:
+ two_alleles = True
+
+ ref = columns[3]
+ is_indel = False
+ for a in alt.split(','):
+ if len(ref) != len(a):
+ is_indel = True
+
+ if args.snp and is_indel:
+ continue
+ if args.indel and not is_indel:
+ continue
+
+ # Filter out homozygous
+ if filter_homo:
+ if len(columns) < 10:
+ print('Warning: variant does not contain enough info to filter homozygous variants')
+ format_col = columns[8].split(':')
+ gt_index = -1
+ for i in range(len(format_col)):
+ if format_col[i] == 'GT':
+ gt_index = i
+ if gt_index == -1:
+ print ('Warning: variant does not contain genotype info')
+ continue
+ val_col = columns[9].split(':')
+ gt_val = val_col[gt_index]
+ gt_col = []
+ if '/' in gt_val:
+ gt_col = gt_val.split('/')
+ elif '|' in gt_val:
+ gt_col = gt_val.split('|')
+ else:
+ print ('Warning: unrecognized genotype info')
+ continue
+ if gt_col[0] == gt_col[1]:
+ continue
+
+ if args.nf:
+ output_file.write(line)
+ continue
+
+ quality_num = float(columns[5])
+ # quality filter(QU)
+ if quality_num < args.qu:
+ qu_fail = True
+
+ if not qu_fail:
+ output_file.write(line)
+ continue
+
+ info_col = columns[7].split(';')
+ for info in info_col:
+ val_col = info.split('=')
+ info_name = val_col[0]
+ info_val = val_col[1]
+ if info_name == 'AB':
+ ab_contain = True
+ if two_alleles:
+ ab_col = info_val.split(',')
+ for ab in ab_col:
+ if float(ab) > args.ab * 0.01:
+ ab_pass = False
+ else:
+ if float(info_val) > args.ab * 0.01:
+ ab_pass = False
+
+ elif info_name == 'DP':
+ rd = int(info_val)
+ elif info_name == 'SRF':
+ srf = int(info_val)
+ elif info_name == 'SRR':
+ srr = int(info_val)
+ elif info_name == 'SAF':
+ if two_alleles:
+ temp_list = info_val.split(',')
+ saf_list = [int(temp_list[0]), int(temp_list[1])]
+ else:
+ saf_list = [int(info_val)]
+ elif info_name == 'SAR':
+ if two_alleles:
+ temp_list = info_val.split(',')
+ sar_list = [int(temp_list[0]), int(temp_list[1])]
+ else:
+ sar_list = [int(info_val)]
+
+ # AB filter
+ if not ab_contain or not ab_pass:
+ ab_fail = True
+
+ if not ab_fail:
+ output_file.write(line)
+ continue
+
+ # Maximum depth(MD) filter
+ if rd == -1:
+ print ('Warning: current variant does not contain read depth info')
+ continue
+ elif rd < md:
+ md_fail = True
+
+ if not md_fail:
+ output_file.write(line)
+ continue
+ # Fisher strand filter(FS)
+
+ oddsratio, pv = stats.fisher_exact([[srf, srr], [saf_list[0], sar_list[0]]])
+ if pv > args.fs:
+ fs_fail = True
+
+ if two_alleles:
+ oddsratio, pv = stats.fisher_exact([[srf, srr], [saf_list[1], sar_list[1]]])
+ if pv > args.fs:
+ fs_fail = True
+
+ if not fs_fail:
+ output_file.write(line)
+ continue
+
+ output_file.close()
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/lib/__init__.py b/lib/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/lib/binary_search_tree.py b/lib/binary_search_tree.py
new file mode 100644
index 0000000..d5a4c17
--- /dev/null
+++ b/lib/binary_search_tree.py
@@ -0,0 +1,445 @@
+# Copyright 2015, Chen Sun
+#
+# Based on source code copyright by 2013, Michael H. Goldwasser
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from lib.linked_binary_tree import LinkedBinaryTree
+from lib.map_base import MapBase
+import copy
+
+class TreeMap(LinkedBinaryTree, MapBase):
+ """Sorted map implementation using a binary search tree."""
+
+ #---------------------------- override Position class ----------------------------
+ class Position(LinkedBinaryTree.Position):
+ def key(self):
+ """Return key of map's key-value pair."""
+ return self.element()._key
+
+ def value(self):
+ """Return value of map's key-value pair."""
+ return self.element()._value
+
+ #------------------------------- nonpublic utilities -------------------------------
+ def _subtree_search(self, p, k):
+ """Return Position of p's subtree having key k, or last node searched."""
+ #print(k)
+ if k == p.key(): # found match
+ return p
+ elif k < p.key(): # search left subtree
+ if self.left(p) is not None:
+ return self._subtree_search(self.left(p), k)
+ else: # search right subtree
+ if self.right(p) is not None:
+ return self._subtree_search(self.right(p), k)
+ return p # unsuccessful search
+
+ #create a subtree_search help function
+ def _search_trace(self, p, k):
+ """Return all the Position that has been searched."""
+ yield p
+ while p is not None and k != p.key():
+ if k < p.key():
+ p = self.left(p)
+ yield p
+ else:
+ p = self.right(p)
+ yield p
+
+ def _subtree_first_position(self, p):
+ """Return Position of first item in subtree rooted at p."""
+ walk = p
+ while self.left(walk) is not None: # keep walking left
+ walk = self.left(walk)
+ return walk
+
+ def _subtree_last_position(self, p):
+ """Return Position of last item in subtree rooted at p."""
+ walk = p
+ while self.right(walk) is not None: # keep walking right
+ walk = self.right(walk)
+ return walk
+
+ #--------------------- public methods providing "positional" support ---------------------
+ def first(self):
+ """Return the first Position in the tree (or None if empty)."""
+ return self._subtree_first_position(self.root()) if len(self) > 0 else None
+
+ def last(self):
+ """Return the last Position in the tree (or None if empty)."""
+ return self._subtree_last_position(self.root()) if len(self) > 0 else None
+
+ def before(self, p):
+ """Return the Position just before p in the natural order.
+
+ Return None if p is the first position.
+ """
+ self._validate(p) # inherited from LinkedBinaryTree
+ if self.left(p):
+ return self._subtree_last_position(self.left(p))
+ else:
+ # walk upward
+ walk = p
+ above = self.parent(walk)
+ while above is not None and walk == self.left(above):
+ walk = above
+ above = self.parent(walk)
+ return above
+
+ def after(self, p):
+ """Return the Position just after p in the natural order.
+
+ Return None if p is the last position.
+ """
+ self._validate(p) # inherited from LinkedBinaryTree
+ if self.right(p):
+ return self._subtree_first_position(self.right(p))
+ else:
+ walk = p
+ above = self.parent(walk)
+ while above is not None and walk == self.right(above):
+ walk = above
+ above = self.parent(walk)
+ return above
+
+ def find_position(self, k):
+ """Return position with key k, or else neighbor (or None if empty)."""
+ if self.is_empty():
+ return None
+ else:
+ p = self._subtree_search(self.root(), k)
+ self._rebalance_access(p) # hook for balanced tree subclasses
+ return p
+
+ def find_nearest(self, k):
+ """Return position with key k, or else the nearest position k' (or None if empty)."""
+ if self.is_empty():
+ return None
+ else:
+ shortest_distance = 3000000000
+ nearest_p = None
+ for p in self._search_trace(self.root(), k):
+ if p is not None:
+ #print(p.key(), abs(p.key()-k), shortest_distance)
+ abs_distance = abs(p.key() - k)
+ if abs_distance < shortest_distance:
+ shortest_distance = abs_distance
+ nearest_p = p
+ self._rebalance_access(nearest_p) # hook for balanced tree subclasses
+ return nearest_p
+
+ def find_nearest_small(self, k):
+ """Return position with key k, or else the nearest position with k' < k (or None if empty)."""
+ if self.is_empty():
+ return None
+ else:
+ shortest_distance = 3000000000
+ nearest_p = None
+ for p in self._search_trace(self.root(), k):
+ if p is not None:
+ distance = k - p.key()
+ if distance >= 0 and distance < shortest_distance:
+ shortest_distance = distance
+ nearest_p = p
+ self._rebalance_access(nearest_p) # hook for balanced tree subclasses
+ return nearest_p
+
+ def find_nearest_large(self, k):
+ """Return position with key k, or else the nearest position with k' > k (or None if empty)."""
+ if self.is_empty():
+ return None
+ else:
+ shortest_distance = 3000000000
+ nearest_p = None
+ for p in self._search_trace(self.root(), k):
+ if p is not None:
+ distance = p.key()-k
+ if distance >= 0 and distance < shortest_distance:
+ shortest_distance = distance
+ nearest_p = p
+ self._rebalance_access(nearest_p) # hook for balanced tree subclasses
+ return nearest_p
+
+
+ def delete(self, p):
+ """Remove the item at given Position."""
+ self._validate(p) # inherited from LinkedBinaryTree
+ if self.left(p) and self.right(p): # p has two children
+ replacement = self._subtree_last_position(self.left(p))
+ self._replace(p, replacement.element()) # from LinkedBinaryTree
+ p = replacement
+ # now p has at most one child
+ parent = self.parent(p)
+ self._delete(p) # inherited from LinkedBinaryTree
+ self._rebalance_delete(parent) # if root deleted, parent is None
+
+
+ def keys(self):
+ key_list = []
+ p = self.first()
+ while p is not None:
+ key_list.append(p.key())
+ p = self.after(p)
+ return key_list
+
+ #--------------------- public methods for (standard) map interface ---------------------
+ def __getitem__(self, k):
+ """Return value associated with key k (raise KeyError if not found)."""
+ if self.is_empty():
+ raise KeyError('Key Error: ' + repr(k))
+ else:
+ p = self._subtree_search(self.root(), k)
+ self._rebalance_access(p) # hook for balanced tree subclasses
+ if k != p.key():
+ raise KeyError('Key Error: ' + repr(k))
+ return p.value()
+
+ def __setitem__(self, k, v):
+ """Assign value v to key k, overwriting existing value if present."""
+ if self.is_empty():
+ leaf = self._add_root(self._Item(k,v)) # from LinkedBinaryTree
+ else:
+ p = self._subtree_search(self.root(), k)
+ if p.key() == k:
+ p.element()._value = v # replace existing item's value
+ self._rebalance_access(p) # hook for balanced tree subclasses
+ return
+ else:
+ item = self._Item(k,v)
+ if p.key() < k:
+ leaf = self._add_right(p, item) # inherited from LinkedBinaryTree
+ else:
+ leaf = self._add_left(p, item) # inherited from LinkedBinaryTree
+ self._rebalance_insert(leaf) # hook for balanced tree subclasses
+
+ def __delitem__(self, k):
+ """Remove item associated with key k (raise KeyError if not found)."""
+ if not self.is_empty():
+ p = self._subtree_search(self.root(), k)
+ if k == p.key():
+ self.delete(p) # rely on positional version
+ return # successful deletion complete
+ self._rebalance_access(p) # hook for balanced tree subclasses
+ raise KeyError('Key Error: ' + repr(k))
+
+ def __iter__(self):
+ """Generate an iteration of all keys in the map in order."""
+ p = self.first()
+ while p is not None:
+ yield p.key()
+ p = self.after(p)
+
+ #--------------------- public methods for sorted map interface ---------------------
+ def __reversed__(self):
+ """Generate an iteration of all keys in the map in reverse order."""
+ p = self.last()
+ while p is not None:
+ yield p.key()
+ p = self.before(p)
+
+ def find_min(self):
+ """Return (key,value) pair with minimum key (or None if empty)."""
+ if self.is_empty():
+ return None
+ else:
+ p = self.first()
+ return (p.key(), p.value())
+
+ def find_max(self):
+ """Return (key,value) pair with maximum key (or None if empty)."""
+ if self.is_empty():
+ return None
+ else:
+ p = self.last()
+ return (p.key(), p.value())
+
+ def find_le(self, k):
+ """Return (key,value) pair with greatest key less than or equal to k.
+
+ Return None if there does not exist such a key.
+ """
+ if self.is_empty():
+ return None
+ else:
+ p = self.find_position(k)
+ if k < p.key():
+ p = self.before(p)
+ return (p.key(), p.value()) if p is not None else None
+
+ def find_lt(self, k):
+ """Return (key,value) pair with greatest key strictly less than k.
+
+ Return None if there does not exist such a key.
+ """
+ if self.is_empty():
+ return None
+ else:
+ p = self.find_position(k)
+ if not p.key() < k:
+ p = self.before(p)
+ return (p.key(), p.value()) if p is not None else None
+
+ def find_ge(self, k):
+ """Return (key,value) pair with least key greater than or equal to k.
+
+ Return None if there does not exist such a key.
+ """
+ if self.is_empty():
+ return None
+ else:
+ p = self.find_position(k) # may not find exact match
+ if p.key() < k: # p's key is too small
+ p = self.after(p)
+ return (p.key(), p.value()) if p is not None else None
+
+ def find_gt(self, k):
+ """Return (key,value) pair with least key strictly greater than k.
+
+ Return None if there does not exist such a key.
+ """
+ if self.is_empty():
+ return None
+ else:
+ p = self.find_position(k)
+ if not k < p.key():
+ p = self.after(p)
+ return (p.key(), p.value()) if p is not None else None
+
+ def linear_range_search(self, position, start, stop):
+ """
+ Iterate all position such that start < position.key < stop
+ Mind: linear_search function only return Position, not key value pair.
+
+ If start is None, searching begins from self.first()
+ If start is None, iteration begins with minimum key of map.
+ If end is None, iteration continues through the maximum key of map.
+ """
+ if not self.is_empty():
+ if position is not None:
+ p = position
+ else:
+ p = self.first()
+ while p is not None and (stop is None or p.key() < stop):
+ if p.key() >= start:
+ yield(p)
+ p = self.after(p)
+
+ def find_range(self, start, stop):
+ """Iterate all (key,value) pairs such that start <= key < stop.
+
+ If start is None, iteration begins with minimum key of map.
+ If stop is None, iteration continues through the maximum key of map.
+ """
+ if not self.is_empty():
+ if start is None:
+ p = self.first()
+ else:
+ # we initialize p with logic similar to find_ge
+ p = self.find_position(start)
+ if p.key() < start:
+ p = self.after(p)
+ while p is not None and (stop is None or p.key() < stop):
+ yield (p.key(), p.value())
+ p = self.after(p)
+
+ #--------------------- hooks used by subclasses to balance a tree ---------------------
+ def _rebalance_insert(self, p):
+ """Call to indicate that position p is newly added."""
+ pass
+
+ def _rebalance_delete(self, p):
+ """Call to indicate that a child of p has been removed."""
+ pass
+
+ def _rebalance_access(self, p):
+ """Call to indicate that position p was recently accessed."""
+ pass
+
+ #--------------------- nonpublic methods to support tree balancing ---------------------
+
+ def _relink(self, parent, child, make_left_child):
+ """Relink parent node with child node (we allow child to be None)."""
+ if make_left_child: # make it a left child
+ parent._left = child
+ else: # make it a right child
+ parent._right = child
+ if child is not None: # make child point to parent
+ child._parent = parent
+
+ def _rotate(self, p):
+ """Rotate Position p above its parent.
+
+ Switches between these configurations, depending on whether p==a or p==b.
+
+ b a
+ / \ / \
+ a t2 t0 b
+ / \ / \
+ t0 t1 t1 t2
+
+ Caller should ensure that p is not the root.
+ """
+ """Rotate Position p above its parent."""
+ x = p._node
+ y = x._parent # we assume this exists
+ z = y._parent # grandparent (possibly None)
+ if z is None:
+ self._root = x # x becomes root
+ x._parent = None
+ else:
+ self._relink(z, x, y == z._left) # x becomes a direct child of z
+ # now rotate x and y, including transfer of middle subtree
+ if x == y._left:
+ self._relink(y, x._right, True) # x._right becomes left child of y
+ self._relink(x, y, False) # y becomes right child of x
+ else:
+ self._relink(y, x._left, False) # x._left becomes right child of y
+ self._relink(x, y, True) # y becomes left child of x
+
+ def _restructure(self, x):
+ """Perform a trinode restructure among Position x, its parent, and its grandparent.
+
+ Return the Position that becomes root of the restructured subtree.
+
+ Assumes the nodes are in one of the following configurations:
+
+ z=a z=c z=a z=c
+ / \ / \ / \ / \
+ t0 y=b y=b t3 t0 y=c y=a t3
+ / \ / \ / \ / \
+ t1 x=c x=a t2 x=b t3 t0 x=b
+ / \ / \ / \ / \
+ t2 t3 t0 t1 t1 t2 t1 t2
+
+ The subtree will be restructured so that the node with key b becomes its root.
+
+ b
+ / \
+ a c
+ / \ / \
+ t0 t1 t2 t3
+
+ Caller should ensure that x has a grandparent.
+ """
+ """Perform trinode restructure of Position x with parent/grandparent."""
+ y = self.parent(x)
+ z = self.parent(y)
+ if (x == self.right(y)) == (y == self.right(z)): # matching alignments
+ self._rotate(y) # single rotation (of y)
+ return y # y is new subtree root
+ else: # opposite alignments
+ self._rotate(x) # double rotation (of x)
+ self._rotate(x)
+ return x # x is new subtree root
diff --git a/lib/binary_tree.py b/lib/binary_tree.py
new file mode 100644
index 0000000..56ae9e7
--- /dev/null
+++ b/lib/binary_tree.py
@@ -0,0 +1,74 @@
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from lib.tree import Tree
+
+class BinaryTree(Tree):
+ """Abstract base class representing a binary tree structure."""
+
+ # --------------------- additional abstract methods ---------------------
+ def left(self, p):
+ """Return a Position representing p's left child.
+
+ Return None if p does not have a left child.
+ """
+ raise NotImplementedError('must be implemented by subclass')
+
+ def right(self, p):
+ """Return a Position representing p's right child.
+
+ Return None if p does not have a right child.
+ """
+ raise NotImplementedError('must be implemented by subclass')
+
+ # ---------- concrete methods implemented in this class ----------
+ def sibling(self, p):
+ """Return a Position representing p's sibling (or None if no sibling)."""
+ parent = self.parent(p)
+ if parent is None: # p must be the root
+ return None # root has no sibling
+ else:
+ if p == self.left(parent):
+ return self.right(parent) # possibly None
+ else:
+ return self.left(parent) # possibly None
+
+ def children(self, p):
+ """Generate an iteration of Positions representing p's children."""
+ if self.left(p) is not None:
+ yield self.left(p)
+ if self.right(p) is not None:
+ yield self.right(p)
+
+ def inorder(self):
+ """Generate an inorder iteration of positions in the tree."""
+ if not self.is_empty():
+ for p in self._subtree_inorder(self.root()):
+ yield p
+
+ def _subtree_inorder(self, p):
+ """Generate an inorder iteration of positions in subtree rooted at p."""
+ if self.left(p) is not None: # if left child exists, traverse its subtree
+ for other in self._subtree_inorder(self.left(p)):
+ yield other
+ yield p # visit p between its subtrees
+ if self.right(p) is not None: # if right child exists, traverse its subtree
+ for other in self._subtree_inorder(self.right(p)):
+ yield other
+
+ # override inherited version to make inorder the default
+ def positions(self):
+ """Generate an iteration of the tree's positions."""
+ return self.inorder() # make inorder the default
diff --git a/lib/linked_binary_tree.py b/lib/linked_binary_tree.py
new file mode 100644
index 0000000..a6cc58b
--- /dev/null
+++ b/lib/linked_binary_tree.py
@@ -0,0 +1,196 @@
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from lib.binary_tree import BinaryTree
+
+#ch8
+class LinkedBinaryTree(BinaryTree):
+    """Linked representation of a binary tree structure.
+
+    Each node keeps references to its parent, its left child and its right
+    child. Public accessors exchange Position objects; the underscore-prefixed
+    mutators are nonpublic building blocks.
+    """
+
+    #-------------------------- nested _Node class --------------------------
+    class _Node:
+        """Lightweight, nonpublic class for storing a node."""
+        __slots__ = '_element', '_parent', '_left', '_right'  # streamline memory usage
+
+        def __init__(self, element, parent=None, left=None, right=None):
+            self._element = element
+            self._parent = parent
+            self._left = left
+            self._right = right
+
+    #-------------------------- nested Position class --------------------------
+    class Position(BinaryTree.Position):
+        """An abstraction representing the location of a single element."""
+
+        def __init__(self, container, node):
+            """Constructor should not be invoked by user."""
+            self._container = container    # tree that issued this position
+            self._node = node              # wrapped _Node
+
+        def element(self):
+            """Return the element stored at this Position."""
+            return self._node._element
+
+        def __eq__(self, other):
+            """Return True if other is a Position representing the same location."""
+            return type(other) is type(self) and other._node is self._node
+
+        def __hash__(self):
+            """Return a hash consistent with __eq__ (identity of the wrapped node).
+
+            In Python 3 a class that defines __eq__ without __hash__ is
+            unhashable; defining it keeps positions usable in sets and as
+            dictionary keys.
+            """
+            return hash(id(self._node))
+
+    #------------------------------- utility methods -------------------------------
+    def _validate(self, p):
+        """Return the node wrapped by Position p after checking that p is valid.
+
+        Raise TypeError if p is not a Position of this tree class.
+        Raise ValueError if p belongs to another tree or was deleted.
+        """
+        if not isinstance(p, self.Position):
+            raise TypeError('p must be proper Position type')
+        if p._container is not self:
+            raise ValueError('p does not belong to this container')
+        if p._node._parent is p._node:     # convention for deprecated nodes (set by _delete)
+            raise ValueError('p is no longer valid')
+        return p._node
+
+    def _make_position(self, node):
+        """Return Position instance for given node (or None if no node)."""
+        return self.Position(self, node) if node is not None else None
+
+    #-------------------------- binary tree constructor --------------------------
+    def __init__(self):
+        """Create an initially empty binary tree."""
+        self._root = None
+        self._size = 0
+
+    #-------------------------- public accessors --------------------------
+    def __len__(self):
+        """Return the total number of elements in the tree."""
+        return self._size
+
+    def root(self):
+        """Return the root Position of the tree (or None if tree is empty)."""
+        return self._make_position(self._root)
+
+    def parent(self, p):
+        """Return the Position of p's parent (or None if p is root)."""
+        node = self._validate(p)
+        return self._make_position(node._parent)
+
+    def left(self, p):
+        """Return the Position of p's left child (or None if no left child)."""
+        node = self._validate(p)
+        return self._make_position(node._left)
+
+    def right(self, p):
+        """Return the Position of p's right child (or None if no right child)."""
+        node = self._validate(p)
+        return self._make_position(node._right)
+
+    def num_children(self, p):
+        """Return the number of children of Position p."""
+        node = self._validate(p)
+        count = 0
+        if node._left is not None:         # left child exists
+            count += 1
+        if node._right is not None:        # right child exists
+            count += 1
+        return count
+
+    #-------------------------- nonpublic mutators --------------------------
+    def _add_root(self, e):
+        """Place element e at the root of an empty tree and return new Position.
+
+        Raise ValueError if tree nonempty.
+        """
+        if self._root is not None:
+            raise ValueError('Root exists')
+        self._size = 1
+        self._root = self._Node(e)
+        return self._make_position(self._root)
+
+    def _add_left(self, p, e):
+        """Create a new left child for Position p, storing element e.
+
+        Return the Position of new node.
+        Raise ValueError if Position p is invalid or p already has a left child.
+        """
+        node = self._validate(p)
+        if node._left is not None:
+            raise ValueError('Left child exists')
+        self._size += 1
+        node._left = self._Node(e, node)   # node is its parent
+        return self._make_position(node._left)
+
+    def _add_right(self, p, e):
+        """Create a new right child for Position p, storing element e.
+
+        Return the Position of new node.
+        Raise ValueError if Position p is invalid or p already has a right child.
+        """
+        node = self._validate(p)
+        if node._right is not None:
+            raise ValueError('Right child exists')
+        self._size += 1
+        node._right = self._Node(e, node)  # node is its parent
+        return self._make_position(node._right)
+
+    def _replace(self, p, e):
+        """Replace the element at position p with e, and return old element."""
+        node = self._validate(p)
+        old = node._element
+        node._element = e
+        return old
+
+    def _delete(self, p):
+        """Delete the node at Position p, and replace it with its child, if any.
+
+        Return the element that had been stored at Position p.
+        Raise ValueError if Position p is invalid or p has two children.
+        """
+        node = self._validate(p)
+        if self.num_children(p) == 2:
+            raise ValueError('Position has two children')
+        # explicit None test: never depend on the truthiness of a node object
+        child = node._left if node._left is not None else node._right  # might be None
+        if child is not None:
+            child._parent = node._parent   # child's grandparent becomes parent
+        if node is self._root:
+            self._root = child             # child becomes root
+        else:
+            parent = node._parent
+            if node is parent._left:
+                parent._left = child
+            else:
+                parent._right = child
+        self._size -= 1
+        node._parent = node                # convention for deprecated node (see _validate)
+        return node._element
+
+    def _attach(self, p, t1, t2):
+        """Attach trees t1 and t2, respectively, as the left and right subtrees of the external Position p.
+
+        As a side effect, set t1 and t2 to empty.
+        Raise TypeError if trees t1 and t2 do not match type of this tree.
+        Raise ValueError if Position p is invalid or not external.
+        """
+        node = self._validate(p)
+        if not self.is_leaf(p):
+            raise ValueError('position must be leaf')
+        if not type(self) is type(t1) is type(t2):  # all 3 trees must be same type
+            raise TypeError('Tree types must match')
+        self._size += len(t1) + len(t2)
+        if not t1.is_empty():              # attach t1 as left subtree of node
+            t1._root._parent = node
+            node._left = t1._root
+            t1._root = None                # set t1 instance to empty
+            t1._size = 0
+        if not t2.is_empty():              # attach t2 as right subtree of node
+            t2._root._parent = node
+            node._right = t2._root
+            t2._root = None                # set t2 instance to empty
+            t2._size = 0
diff --git a/lib/linked_queue.py b/lib/linked_queue.py
new file mode 100644
index 0000000..978b35b
--- /dev/null
+++ b/lib/linked_queue.py
@@ -0,0 +1,77 @@
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#from ..exceptions import Empty
+
+#ch7
+class Empty(Exception):
+    """Error raised when accessing an element of an empty container."""
+
+
+class LinkedQueue:
+    """FIFO queue implementation using a singly linked list for storage."""
+
+    #-------------------------- nested _Node class --------------------------
+    class _Node:
+        """Lightweight, nonpublic class for storing a singly linked node."""
+        __slots__ = '_element', '_next'    # streamline memory usage
+
+        def __init__(self, element, next):
+            self._element = element
+            self._next = next
+
+    #------------------------------- queue methods -------------------------------
+    def __init__(self):
+        """Create an empty queue."""
+        self._head = None                  # node holding the front element
+        self._tail = None                  # node holding the back element
+        self._size = 0                     # number of queue elements
+
+    def __len__(self):
+        """Return the number of elements in the queue."""
+        return self._size
+
+    def is_empty(self):
+        """Return True if the queue is empty."""
+        return self._size == 0
+
+    def first(self):
+        """Return (but do not remove) the element at the front of the queue.
+
+        Raise Empty exception if the queue is empty.
+        """
+        if self.is_empty():
+            raise Empty('Queue is empty')
+        return self._head._element         # front aligned with head of list
+
+    def dequeue(self):
+        """Remove and return the first element of the queue (i.e., FIFO).
+
+        Raise Empty exception if the queue is empty.
+        """
+        if self.is_empty():
+            raise Empty('Queue is empty')
+        answer = self._head._element
+        self._head = self._head._next
+        self._size -= 1
+        if self.is_empty():                # special case as queue is empty
+            self._tail = None              # removed head had been the tail
+        return answer
+
+    def enqueue(self, e):
+        """Add an element to the back of queue."""
+        newest = self._Node(e, None)       # node will be new tail node
+        if self.is_empty():
+            self._head = newest            # special case: previously empty
+        else:
+            self._tail._next = newest
+        self._tail = newest                # update reference to tail node
+        self._size += 1
diff --git a/lib/map_base.py b/lib/map_base.py
new file mode 100644
index 0000000..a93c1f7
--- /dev/null
+++ b/lib/map_base.py
@@ -0,0 +1,38 @@
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+try:
+    from collections.abc import MutableMapping   # Python 3.3+; the only location since 3.10
+except ImportError:                              # Python 2 fallback
+    from collections import MutableMapping
+
+#ch10
+class MapBase(MutableMapping):
+    """Our own abstract base class that includes a nonpublic _Item class."""
+
+    #------------------------------- nested _Item class -------------------------------
+    class _Item:
+        """Lightweight composite to store key-value pairs as map items.
+
+        Items compare by key only: the stored value never takes part in
+        ==, != or <.
+        """
+        __slots__ = '_key', '_value'
+
+        def __init__(self, k, v):
+            self._key = k
+            self._value = v
+
+        def __eq__(self, other):
+            return self._key == other._key     # compare items based on their keys
+
+        def __ne__(self, other):
+            return not (self == other)         # opposite of __eq__
+
+        def __lt__(self, other):
+            return self._key < other._key      # compare items based on their keys
diff --git a/lib/red_black_tree.py b/lib/red_black_tree.py
new file mode 100644
index 0000000..c5905c7
--- /dev/null
+++ b/lib/red_black_tree.py
@@ -0,0 +1,112 @@
+# Copyright 2015, Chen Sun
+#
+# Based on source code copyright by 2013, Michael H. Goldwasser
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from lib.binary_search_tree import TreeMap
+
+"""
+usage:
+"""
+class RedBlackTreeMap(TreeMap):
+ """Sorted map implementation using a red-black tree.
+
+ Adds a color flag to every node and defines _rebalance_insert and
+ _rebalance_delete, which recolor and restructure the tree.
+ NOTE(review): _restructure and _rotate are not defined in this class;
+ presumably they are inherited from TreeMap, which presumably also calls
+ the two _rebalance_* methods -- confirm in lib/binary_search_tree.py.
+ """
+
+ #-------------------------- nested _Node class --------------------------
+ class _Node(TreeMap._Node):
+ """Node class for red-black tree maintains bit that denotes color."""
+ __slots__ = '_red' # add additional data member to the Node class
+
+ def __init__(self, element, parent=None, left=None, right=None):
+ # the base node class stores the element and the three links
+ TreeMap._Node.__init__(self, element, parent, left, right)
+ self._red = True # new node red by default
+
+ #------------------------- positional-based utility methods -------------------------
+ # we consider a nonexistent child to be trivially black
+ # Each helper takes a Position p and reads or writes the _red flag of p._node.
+ def _set_red(self, p): p._node._red = True
+ def _set_black(self, p): p._node._red = False
+ def _set_color(self, p, make_red): p._node._red = make_red
+ # a None position is reported as not red
+ def _is_red(self, p): return p is not None and p._node._red
+ def _is_red_leaf(self, p): return self._is_red(p) and self.is_leaf(p)
+
+ def _get_red_child(self, p):
+ """Return a red child of p (or None if no such child)."""
+ # the left child is examined before the right child
+ for child in (self.left(p), self.right(p)):
+ if self._is_red(child):
+ return child
+ return None
+
+ #------------------------- support for insertions -------------------------
+ def _rebalance_insert(self, p):
+ """Restore the coloring rules starting at Position p, treated as a red node."""
+ self._resolve_red(p) # new node is always red
+
+ def _resolve_red(self, p):
+ """Resolve a possible red-red parent/child pair at red Position p.
+
+ A root is simply colored black. Otherwise, when p's parent is red, the
+ color of the parent's sibling selects between a restructuring (Case 1)
+ and a recoloring that recurs at the grandparent (Case 2).
+ """
+ if self.is_root(p):
+ self._set_black(p) # make root black
+ else:
+ parent = self.parent(p)
+ if self._is_red(parent): # double red problem
+ uncle = self.sibling(parent)
+ if not self._is_red(uncle): # Case 1: misshapen 4-node
+ middle = self._restructure(p) # do trinode restructuring
+ self._set_black(middle) # and then fix colors
+ self._set_red(self.left(middle))
+ self._set_red(self.right(middle))
+ else: # Case 2: overfull 5-node
+ grand = self.parent(parent)
+ self._set_red(grand) # grandparent becomes red
+ self._set_black(self.left(grand)) # its children become black
+ self._set_black(self.right(grand))
+ self._resolve_red(grand) # recur at red grandparent
+
+ #------------------------- support for deletions -------------------------
+ def _rebalance_delete(self, p):
+ """Repair colors after a deletion, working from Position p.
+
+ NOTE(review): p is presumably the parent of the node that was removed
+ (None when there is no such parent) -- confirm against the caller.
+ """
+ if len(self) == 1:
+ self._set_black(self.root()) # special case: ensure that root is black
+ elif p is not None:
+ n = self.num_children(p)
+ if n == 1: # deficit exists unless child is a red leaf
+ c = next(self.children(p)) # the only child of p
+ if not self._is_red_leaf(c):
+ self._fix_deficit(p, c)
+ elif n == 2: # removed black node with red child
+ # blacken whichever child of p is the red leaf (left is tested first)
+ if self._is_red_leaf(self.left(p)):
+ self._set_black(self.left(p))
+ else:
+ self._set_black(self.right(p))
+
+ def _fix_deficit(self, z, y):
+ """Resolve black deficit at z, where y is the root of z's heavier subtree."""
+ if not self._is_red(y): # y is black; will apply Case 1 or 2
+ x = self._get_red_child(y)
+ if x is not None: # Case 1: y is black and has red child x; do "transfer"
+ old_color = self._is_red(z) # remembered before the restructuring moves z
+ middle = self._restructure(x)
+ self._set_color(middle, old_color) # middle gets old color of z
+ self._set_black(self.left(middle)) # children become black
+ self._set_black(self.right(middle))
+ else: # Case 2: y is black, but no red children; recolor as "fusion"
+ self._set_red(y)
+ if self._is_red(z):
+ self._set_black(z) # this resolves the problem
+ elif not self.is_root(z):
+ self._fix_deficit(self.parent(z), self.sibling(z)) # recur upward
+ else: # Case 3: y is red; rotate misaligned 3-node and repeat
+ self._rotate(y)
+ self._set_black(y)
+ self._set_red(z)
+ # repeat at z with its child on the side away from y as the new y
+ if z == self.right(y):
+ self._fix_deficit(z, self.left(z))
+ else:
+ self._fix_deficit(z, self.right(z))
diff --git a/lib/tree.py b/lib/tree.py
new file mode 100644
index 0000000..921ba67
--- /dev/null
+++ b/lib/tree.py
@@ -0,0 +1,151 @@
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from lib.linked_queue import LinkedQueue #LinkedQueue is only used for bfs
+import collections
+
+#ch8
+class Tree:
+    """Abstract base class representing a tree structure."""
+
+    #------------------------------- nested Position class -------------------------------
+    class Position:
+        """An abstraction representing the location of a single element within a tree.
+
+        Two distinct Position instances may refer to the same location in a
+        tree, so always compare positions with 'p == q' rather than 'p is q'.
+
+        The tree ADT is defined in terms of positions, which serve as an
+        abstraction for the nodes of a tree.
+        """
+
+        def element(self):
+            """Return the element stored at this Position."""
+            raise NotImplementedError('must be implemented by subclass')
+
+        def __eq__(self, other):
+            """Return True if other Position represents the same location."""
+            raise NotImplementedError('must be implemented by subclass')
+
+        def __ne__(self, other):
+            """Return True if other does not represent the same location."""
+            same = (self == other)         # delegate to __eq__
+            return not same
+
+    # ---------- abstract methods that concrete subclass must support ----------
+    def root(self):
+        """Return Position representing the tree's root (or None if empty)."""
+        raise NotImplementedError('must be implemented by subclass')
+
+    def parent(self, p):
+        """Return Position representing p's parent (or None if p is root)."""
+        raise NotImplementedError('must be implemented by subclass')
+
+    def num_children(self, p):
+        """Return the number of children that Position p has."""
+        raise NotImplementedError('must be implemented by subclass')
+
+    def children(self, p):
+        """Generate an iteration of Positions representing p's children."""
+        raise NotImplementedError('must be implemented by subclass')
+
+    def __len__(self):
+        """Return the total number of elements in the tree."""
+        raise NotImplementedError('must be implemented by subclass')
+
+    # ---------- concrete methods implemented in this class ----------
+    def is_root(self, p):
+        """Return True if Position p represents the root of the tree."""
+        return self.root() == p
+
+    def is_leaf(self, p):
+        """Return True if Position p does not have any children."""
+        return self.num_children(p) == 0
+
+    def is_empty(self):
+        """Return True if the tree is empty."""
+        return len(self) == 0
+
+    def depth(self, p):
+        """Return the number of levels separating Position p from the root."""
+        levels = 0
+        while not self.is_root(p):         # climb one parent link per level
+            p = self.parent(p)
+            levels += 1
+        return levels
+
+    def _height1(self):                    # works, but O(n^2) worst-case time
+        """Return the height of the tree as the largest depth of any leaf."""
+        leaf_depths = (self.depth(q) for q in self.positions() if self.is_leaf(q))
+        return max(leaf_depths)
+
+    def _height2(self, p):                 # time is linear in size of subtree
+        """Return the height of the subtree rooted at Position p."""
+        if self.is_leaf(p):
+            return 0
+        return 1 + max(self._height2(kid) for kid in self.children(p))
+
+    def height(self, p=None):
+        """Return the height of the subtree rooted at Position p.
+
+        If p is None, return the height of the entire tree.
+        """
+        start = self.root() if p is None else p
+        return self._height2(start)        # start _height2 recursion
+
+    def __iter__(self):
+        """Generate the tree's elements in the same order as positions()."""
+        for position in self.positions():
+            yield position.element()
+
+    def positions(self):
+        """Generate an iteration of the tree's positions (preorder)."""
+        return self.preorder()
+
+    def preorder(self):
+        """Generate a preorder iteration of positions in the tree."""
+        if self.is_empty():
+            return                         # nothing to visit
+        for position in self._subtree_preorder(self.root()):
+            yield position
+
+    def _subtree_preorder(self, p):
+        """Generate a preorder iteration of positions in subtree rooted at p."""
+        yield p                            # p comes before everything below it
+        for kid in self.children(p):
+            for below in self._subtree_preorder(kid):
+                yield below
+
+    def postorder(self):
+        """Generate a postorder iteration of positions in the tree."""
+        if self.is_empty():
+            return                         # nothing to visit
+        for position in self._subtree_postorder(self.root()):
+            yield position
+
+    def _subtree_postorder(self, p):
+        """Generate a postorder iteration of positions in subtree rooted at p."""
+        for kid in self.children(p):
+            for below in self._subtree_postorder(kid):
+                yield below
+        yield p                            # p comes after everything below it
+
+    def breadthfirst(self):
+        """Generate a breadth-first iteration of the positions of the tree."""
+        if self.is_empty():
+            return
+        pending = LinkedQueue()            # discovered positions not yet yielded
+        pending.enqueue(self.root())
+        while not pending.is_empty():
+            current = pending.dequeue()    # oldest discovered position
+            yield current
+            for kid in self.children(current):
+                pending.enqueue(kid)       # children wait behind the current level
diff --git a/license.txt b/license.txt
new file mode 100644
index 0000000..94a9ed0
--- /dev/null
+++ b/license.txt
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+ <program> Copyright (C) <year> <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/makefile b/makefile
new file mode 100644
index 0000000..cc0e08a
--- /dev/null
+++ b/makefile
@@ -0,0 +1,12 @@
+# Default goal: building `all` just builds `vm`.
+all: vm
+# all, vm and clean are declared phony targets.
+.PHONY: all vm clean
+
+# Run the `all` target of the makefile in src/, then set the executable bit
+# on varmatch, purify and filter in this directory.
+vm:
+ $(MAKE) -C src all
+ chmod +x varmatch
+ chmod +x purify
+ chmod +x filter
+
+# Delegate cleanup to the `clean` target of the makefile in src/.
+clean:
+ $(MAKE) -C src clean
+
diff --git a/purify b/purify
new file mode 100755
index 0000000..837cd28
--- /dev/null
+++ b/purify
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+
+from sys import argv
+import sys
+import argparse
+
+# Command-line options.  None of them is marked required, so any option left
+# off the command line is stored as None.
+parser = argparse.ArgumentParser()
+parser.add_argument('-i', '--input', help='original vcf file')
+parser.add_argument('-o', '--output', help='purified vcf file')
+parser.add_argument('-g', '--genome', help='genome FASTA file')
+# Parsing happens at module level, i.e. as soon as this file is executed.
+args = parser.parse_args()
+
+# Module-level aliases for the three paths; main() reads these.
+refFilename = args.genome
+vcfFilename = args.input
+purifyFilename = args.output
+
+def read_reference(refFilename):
+ sequence = ''
+ refFile = open(refFilename)
+
+ for line in refFile.readlines():
+ if line.startswith(">"):
+ continue
+ line = line.strip()
+ sequence += line
+ refFile.close()
+ return sequence
+
+def main():
+ print ('\t[input] ' + args.input)
+ print ('\t[genome]' + args.genome)
+
+ vcfFile = open(vcfFilename)
+ purifyFile = open(purifyFilename, "w")
+
+ reference = read_reference(refFilename)
+
+ num = 0
+ heteNum = 0
+
+ for line in vcfFile.readlines():
+ if line.startswith("#"):
+ purifyFile.write(line)
+ continue
+ columns = line.split("\t")
+ chrom = columns[0]
+ pos = int(columns[1]) - 1
+ reservedRef = columns[3]
+ reservedAlt = columns[4]
+ ref = columns[3].upper()
+ alt = columns[4].upper()
+ end = pos + len(ref)
+ refSeq = reference[pos:end].upper()
+ info = columns[7].split(";")[2]
+ infoId = info.split("=")[1]
+
+ if ref != refSeq:
+ num += 1
+ continue
+ else:
+ purifyFile.write(line)
+ vcfFile.close()
+ purifyFile.close()
+ if(num == 0):
+ print ('\t all variants in input vcf file match genome sequence')
+ else:
+ print ('\t[Warning] ' + str(num) + ' variants do not match genome sequence and removed!')
+ print ('\t[output]' + args.output)
+
+if __name__ == '__main__':
+ main()
diff --git a/py/lib/__init__.py b/py/lib/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/py/lib/binary_search_tree.py b/py/lib/binary_search_tree.py
new file mode 100644
index 0000000..d5a4c17
--- /dev/null
+++ b/py/lib/binary_search_tree.py
@@ -0,0 +1,445 @@
+# Copyright 2015, Chen Sun
+#
+# Based on source code copyright by 2013, Michael H. Goldwasser
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from lib.linked_binary_tree import LinkedBinaryTree
+from lib.map_base import MapBase
+import copy
+
class TreeMap(LinkedBinaryTree, MapBase):
    """Sorted map implementation using a binary search tree.

    Keys must be mutually comparable. Besides the standard mapping
    interface the class offers positional navigation (first/last/before/
    after), ordered queries (find_le, find_range, ...) and nearest-key
    lookups. Subclasses can keep the tree balanced by overriding the
    _rebalance_* hooks.
    """

    #---------------------------- override Position class ----------------------------
    class Position(LinkedBinaryTree.Position):
        def key(self):
            """Return key of map's key-value pair."""
            return self.element()._key

        def value(self):
            """Return value of map's key-value pair."""
            return self.element()._value

    #------------------------------- nonpublic utilities -------------------------------
    def _subtree_search(self, p, k):
        """Return Position of p's subtree having key k, or last node searched."""
        # Iterative descent: same result as the recursive formulation but
        # immune to the recursion limit on deep (unbalanced) trees.
        walk = p
        while k != walk.key():
            if k < walk.key():
                child = self.left(walk)
            else:
                child = self.right(walk)
            if child is None:               # unsuccessful search
                break
            walk = child
        return walk

    def _search_trace(self, p, k):
        """Generate every Position visited while searching p's subtree for k.

        The last generated value is None when the search falls off the tree,
        so callers must skip None entries.
        """
        yield p
        while p is not None and k != p.key():
            if k < p.key():
                p = self.left(p)
            else:
                p = self.right(p)
            yield p

    def _nearest_on_path(self, k, distance):
        """Return the Position on the search path of k minimising distance.

        distance -- callable mapping a key to a comparable distance, or to
                    None when that key is not an acceptable candidate.

        Ties are resolved in favour of the Position met first on the path.
        Return None when no acceptable candidate exists.
        """
        best = None
        best_distance = None
        for p in self._search_trace(self.root(), k):
            if p is None:
                continue
            d = distance(p.key())
            if d is None:
                continue
            if best_distance is None or d < best_distance:
                best, best_distance = p, d
        if best is not None:
            self._rebalance_access(best)    # hook for balanced tree subclasses
        return best

    def _subtree_first_position(self, p):
        """Return Position of first item in subtree rooted at p."""
        walk = p
        while self.left(walk) is not None:  # keep walking left
            walk = self.left(walk)
        return walk

    def _subtree_last_position(self, p):
        """Return Position of last item in subtree rooted at p."""
        walk = p
        while self.right(walk) is not None: # keep walking right
            walk = self.right(walk)
        return walk

    #--------------------- public methods providing "positional" support ---------------------
    def first(self):
        """Return the first Position in the tree (or None if empty)."""
        return self._subtree_first_position(self.root()) if len(self) > 0 else None

    def last(self):
        """Return the last Position in the tree (or None if empty)."""
        return self._subtree_last_position(self.root()) if len(self) > 0 else None

    def before(self, p):
        """Return the Position just before p in the natural order.

        Return None if p is the first position.
        """
        self._validate(p)                   # inherited from LinkedBinaryTree
        if self.left(p) is not None:
            return self._subtree_last_position(self.left(p))
        # walk upward until we leave a right subtree
        walk = p
        above = self.parent(walk)
        while above is not None and walk == self.left(above):
            walk = above
            above = self.parent(walk)
        return above

    def after(self, p):
        """Return the Position just after p in the natural order.

        Return None if p is the last position.
        """
        self._validate(p)                   # inherited from LinkedBinaryTree
        if self.right(p) is not None:
            return self._subtree_first_position(self.right(p))
        # walk upward until we leave a left subtree
        walk = p
        above = self.parent(walk)
        while above is not None and walk == self.right(above):
            walk = above
            above = self.parent(walk)
        return above

    def find_position(self, k):
        """Return position with key k, or else neighbor (or None if empty)."""
        if self.is_empty():
            return None
        p = self._subtree_search(self.root(), k)
        self._rebalance_access(p)           # hook for balanced tree subclasses
        return p

    def find_nearest(self, k):
        """Return position with key k, or else the position whose key is nearest to k.

        Return None if the map is empty. Keys must support subtraction.
        """
        if self.is_empty():
            return None
        return self._nearest_on_path(k, lambda key: abs(key - k))

    def find_nearest_small(self, k):
        """Return position with the greatest key k' such that k' <= k.

        Return None if the map is empty or holds no such key.
        """
        if self.is_empty():
            return None

        def distance(key):
            gap = k - key
            return gap if gap >= 0 else None

        return self._nearest_on_path(k, distance)

    def find_nearest_large(self, k):
        """Return position with the least key k' such that k' >= k.

        Return None if the map is empty or holds no such key.
        """
        if self.is_empty():
            return None

        def distance(key):
            gap = key - k
            return gap if gap >= 0 else None

        return self._nearest_on_path(k, distance)

    def delete(self, p):
        """Remove the item at given Position."""
        self._validate(p)                   # inherited from LinkedBinaryTree
        if self.left(p) is not None and self.right(p) is not None:
            # p has two children: move its predecessor's item into p and
            # delete the predecessor node instead
            replacement = self._subtree_last_position(self.left(p))
            self._replace(p, replacement.element())   # from LinkedBinaryTree
            p = replacement
        # now p has at most one child
        parent = self.parent(p)
        self._delete(p)                     # inherited from LinkedBinaryTree
        self._rebalance_delete(parent)      # if root deleted, parent is None

    def keys(self):
        """Return a list of all keys in increasing order."""
        key_list = []
        p = self.first()
        while p is not None:
            key_list.append(p.key())
            p = self.after(p)
        return key_list

    #--------------------- public methods for (standard) map interface ---------------------
    def __getitem__(self, k):
        """Return value associated with key k (raise KeyError if not found)."""
        if self.is_empty():
            raise KeyError('Key Error: ' + repr(k))
        p = self._subtree_search(self.root(), k)
        self._rebalance_access(p)           # hook for balanced tree subclasses
        if k != p.key():
            raise KeyError('Key Error: ' + repr(k))
        return p.value()

    def __setitem__(self, k, v):
        """Assign value v to key k, overwriting existing value if present."""
        if self.is_empty():
            leaf = self._add_root(self._Item(k, v))    # from LinkedBinaryTree
        else:
            p = self._subtree_search(self.root(), k)
            if p.key() == k:
                p.element()._value = v      # replace existing item's value
                self._rebalance_access(p)   # hook for balanced tree subclasses
                return
            item = self._Item(k, v)
            if p.key() < k:
                leaf = self._add_right(p, item)   # inherited from LinkedBinaryTree
            else:
                leaf = self._add_left(p, item)    # inherited from LinkedBinaryTree
        self._rebalance_insert(leaf)        # hook for balanced tree subclasses

    def __delitem__(self, k):
        """Remove item associated with key k (raise KeyError if not found)."""
        if not self.is_empty():
            p = self._subtree_search(self.root(), k)
            if k == p.key():
                self.delete(p)              # rely on positional version
                return                      # successful deletion complete
            self._rebalance_access(p)       # hook for balanced tree subclasses
        raise KeyError('Key Error: ' + repr(k))

    def __iter__(self):
        """Generate an iteration of all keys in the map in order."""
        p = self.first()
        while p is not None:
            yield p.key()
            p = self.after(p)

    #--------------------- public methods for sorted map interface ---------------------
    def __reversed__(self):
        """Generate an iteration of all keys in the map in reverse order."""
        p = self.last()
        while p is not None:
            yield p.key()
            p = self.before(p)

    def find_min(self):
        """Return (key,value) pair with minimum key (or None if empty)."""
        if self.is_empty():
            return None
        p = self.first()
        return (p.key(), p.value())

    def find_max(self):
        """Return (key,value) pair with maximum key (or None if empty)."""
        if self.is_empty():
            return None
        p = self.last()
        return (p.key(), p.value())

    def find_le(self, k):
        """Return (key,value) pair with greatest key less than or equal to k.

        Return None if there does not exist such a key.
        """
        if self.is_empty():
            return None
        p = self.find_position(k)
        if k < p.key():
            p = self.before(p)
        return (p.key(), p.value()) if p is not None else None

    def find_lt(self, k):
        """Return (key,value) pair with greatest key strictly less than k.

        Return None if there does not exist such a key.
        """
        if self.is_empty():
            return None
        p = self.find_position(k)
        if not p.key() < k:
            p = self.before(p)
        return (p.key(), p.value()) if p is not None else None

    def find_ge(self, k):
        """Return (key,value) pair with least key greater than or equal to k.

        Return None if there does not exist such a key.
        """
        if self.is_empty():
            return None
        p = self.find_position(k)           # may not find exact match
        if p.key() < k:                     # p's key is too small
            p = self.after(p)
        return (p.key(), p.value()) if p is not None else None

    def find_gt(self, k):
        """Return (key,value) pair with least key strictly greater than k.

        Return None if there does not exist such a key.
        """
        if self.is_empty():
            return None
        p = self.find_position(k)
        if not k < p.key():
            p = self.after(p)
        return (p.key(), p.value()) if p is not None else None

    def linear_range_search(self, position, start, stop):
        """Generate every Position with start <= key < stop by a linear walk.

        Unlike find_range, this yields Positions rather than (key,value)
        pairs, and it scans forward from a caller-supplied Position.

        If position is None, the walk begins at self.first().
        If start is None, no lower bound is applied.
        If stop is None, the walk continues through the maximum key of map.
        """
        if not self.is_empty():
            p = position if position is not None else self.first()
            while p is not None and (stop is None or p.key() < stop):
                if start is None or p.key() >= start:
                    yield p
                p = self.after(p)

    def find_range(self, start, stop):
        """Iterate all (key,value) pairs such that start <= key < stop.

        If start is None, iteration begins with minimum key of map.
        If stop is None, iteration continues through the maximum key of map.
        """
        if not self.is_empty():
            if start is None:
                p = self.first()
            else:
                # we initialize p with logic similar to find_ge
                p = self.find_position(start)
                if p.key() < start:
                    p = self.after(p)
            while p is not None and (stop is None or p.key() < stop):
                yield (p.key(), p.value())
                p = self.after(p)

    #--------------------- hooks used by subclasses to balance a tree ---------------------
    def _rebalance_insert(self, p):
        """Call to indicate that position p is newly added."""
        pass

    def _rebalance_delete(self, p):
        """Call to indicate that a child of p has been removed."""
        pass

    def _rebalance_access(self, p):
        """Call to indicate that position p was recently accessed."""
        pass

    #--------------------- nonpublic methods to support tree balancing ---------------------
    def _relink(self, parent, child, make_left_child):
        """Relink parent node with child node (we allow child to be None)."""
        if make_left_child:                 # make it a left child
            parent._left = child
        else:                               # make it a right child
            parent._right = child
        if child is not None:               # make child point to parent
            child._parent = parent

    def _rotate(self, p):
        r"""Rotate Position p above its parent.

        Switches between these configurations, depending on whether p==a or p==b.

              b                  a
             / \                / \
            a  t2             t0   b
           / \                    / \
          t0  t1                 t1  t2

        Caller should ensure that p is not the root.
        """
        x = p._node
        y = x._parent                       # we assume this exists
        z = y._parent                       # grandparent (possibly None)
        if z is None:
            self._root = x                  # x becomes root
            x._parent = None
        else:
            self._relink(z, x, y == z._left)    # x becomes a direct child of z
        # now rotate x and y, including transfer of middle subtree
        if x == y._left:
            self._relink(y, x._right, True)     # x._right becomes left child of y
            self._relink(x, y, False)           # y becomes right child of x
        else:
            self._relink(y, x._left, False)     # x._left becomes right child of y
            self._relink(x, y, True)            # y becomes left child of x

    def _restructure(self, x):
        r"""Perform a trinode restructure among Position x, its parent, and its grandparent.

        Return the Position that becomes root of the restructured subtree.

        Assumes the nodes are in one of the following configurations:

            z=a                 z=c           z=a               z=c
           /  \                /  \          /  \              /  \
          t0  y=b             y=b  t3       t0   y=c          y=a  t3
             /  \            /  \               /  \         /  \
            t1  x=c         x=a  t2            x=b  t3      t0   x=b
               /  \        /  \               /  \              /  \
              t2  t3      t0  t1             t1  t2            t1  t2

        The subtree will be restructured so that the node with key b becomes its root.

                  b
                /   \
              a       c
             / \     / \
            t0  t1  t2  t3

        Caller should ensure that x has a grandparent.
        """
        y = self.parent(x)
        z = self.parent(y)
        if (x == self.right(y)) == (y == self.right(z)):  # matching alignments
            self._rotate(y)                 # single rotation (of y)
            return y                        # y is new subtree root
        else:                               # opposite alignments
            self._rotate(x)                 # double rotation (of x)
            self._rotate(x)
            return x                        # x is new subtree root
diff --git a/py/lib/binary_tree.py b/py/lib/binary_tree.py
new file mode 100644
index 0000000..56ae9e7
--- /dev/null
+++ b/py/lib/binary_tree.py
@@ -0,0 +1,74 @@
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from lib.tree import Tree
+
class BinaryTree(Tree):
    """Abstract base class for trees in which a node has at most two children."""

    # --------------------- additional abstract methods ---------------------
    def left(self, p):
        """Return the Position of p's left child, or None when there is none."""
        raise NotImplementedError('must be implemented by subclass')

    def right(self, p):
        """Return the Position of p's right child, or None when there is none."""
        raise NotImplementedError('must be implemented by subclass')

    # ---------- concrete methods implemented in this class ----------
    def sibling(self, p):
        """Return the Position of the other child of p's parent.

        Return None when p is the root or when p is an only child.
        """
        above = self.parent(p)
        if above is None:
            # only the root has no parent, hence no sibling
            return None
        if p == self.left(above):
            return self.right(above)        # may be None
        return self.left(above)             # may be None

    def children(self, p):
        """Generate the Positions of p's existing children, left one first."""
        for accessor in (self.left, self.right):
            child = accessor(p)
            if child is not None:
                yield child

    def inorder(self):
        """Generate the tree's Positions in inorder (left, node, right)."""
        if self.is_empty():
            return
        for position in self._subtree_inorder(self.root()):
            yield position

    def _subtree_inorder(self, p):
        """Generate the Positions of the subtree rooted at p in inorder."""
        left_child = self.left(p)
        if left_child is not None:
            for position in self._subtree_inorder(left_child):
                yield position
        yield p                             # p comes between its two subtrees
        right_child = self.right(p)
        if right_child is not None:
            for position in self._subtree_inorder(right_child):
                yield position

    def positions(self):
        """Generate the tree's Positions; inorder is the default traversal."""
        return self.inorder()
diff --git a/py/lib/linked_binary_tree.py b/py/lib/linked_binary_tree.py
new file mode 100644
index 0000000..a6cc58b
--- /dev/null
+++ b/py/lib/linked_binary_tree.py
@@ -0,0 +1,196 @@
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from lib.binary_tree import BinaryTree
+
+#ch8
class LinkedBinaryTree(BinaryTree):
    """Linked representation of a binary tree structure."""

    #-------------------------- nested _Node class --------------------------
    class _Node(object):
        """Lightweight, nonpublic class for storing a node."""
        # Deriving from object makes __slots__ effective on Python 2 as well.
        __slots__ = '_element', '_parent', '_left', '_right'   # streamline memory usage

        def __init__(self, element, parent=None, left=None, right=None):
            self._element = element
            self._parent = parent
            self._left = left
            self._right = right

    #-------------------------- nested Position class --------------------------
    class Position(BinaryTree.Position):
        """An abstraction representing the location of a single element."""

        def __init__(self, container, node):
            """Constructor should not be invoked by user."""
            self._container = container
            self._node = node

        def element(self):
            """Return the element stored at this Position."""
            return self._node._element

        def __eq__(self, other):
            """Return True if other is a Position representing the same location."""
            return type(other) is type(self) and other._node is self._node

        def __hash__(self):
            """Hash on the underlying node so equal Positions hash alike.

            Defining __eq__ alone would make Positions unhashable on
            Python 3 and hash by wrapper identity on Python 2.
            """
            return hash(id(self._node))

    #------------------------------- utility methods -------------------------------
    def _validate(self, p):
        """Return associated node, if position is valid.

        Raise TypeError if p is not a Position of this tree class.
        Raise ValueError if p belongs to another tree or was deleted.
        """
        if not isinstance(p, self.Position):
            raise TypeError('p must be proper Position type')
        if p._container is not self:
            raise ValueError('p does not belong to this container')
        if p._node._parent is p._node:      # convention for deprecated nodes
            raise ValueError('p is no longer valid')
        return p._node

    def _make_position(self, node):
        """Return Position instance for given node (or None if no node)."""
        return self.Position(self, node) if node is not None else None

    #-------------------------- binary tree constructor --------------------------
    def __init__(self):
        """Create an initially empty binary tree."""
        self._root = None
        self._size = 0

    #-------------------------- public accessors --------------------------
    def __len__(self):
        """Return the total number of elements in the tree."""
        return self._size

    def root(self):
        """Return the root Position of the tree (or None if tree is empty)."""
        return self._make_position(self._root)

    def parent(self, p):
        """Return the Position of p's parent (or None if p is root)."""
        node = self._validate(p)
        return self._make_position(node._parent)

    def left(self, p):
        """Return the Position of p's left child (or None if no left child)."""
        node = self._validate(p)
        return self._make_position(node._left)

    def right(self, p):
        """Return the Position of p's right child (or None if no right child)."""
        node = self._validate(p)
        return self._make_position(node._right)

    def num_children(self, p):
        """Return the number of children of Position p."""
        node = self._validate(p)
        count = 0
        if node._left is not None:          # left child exists
            count += 1
        if node._right is not None:         # right child exists
            count += 1
        return count

    #-------------------------- nonpublic mutators --------------------------
    def _add_root(self, e):
        """Place element e at the root of an empty tree and return new Position.

        Raise ValueError if tree nonempty.
        """
        if self._root is not None:
            raise ValueError('Root exists')
        self._size = 1
        self._root = self._Node(e)
        return self._make_position(self._root)

    def _add_left(self, p, e):
        """Create a new left child for Position p, storing element e.

        Return the Position of new node.
        Raise ValueError if Position p is invalid or p already has a left child.
        """
        node = self._validate(p)
        if node._left is not None:
            raise ValueError('Left child exists')
        self._size += 1
        node._left = self._Node(e, node)    # node is its parent
        return self._make_position(node._left)

    def _add_right(self, p, e):
        """Create a new right child for Position p, storing element e.

        Return the Position of new node.
        Raise ValueError if Position p is invalid or p already has a right child.
        """
        node = self._validate(p)
        if node._right is not None:
            raise ValueError('Right child exists')
        self._size += 1
        node._right = self._Node(e, node)   # node is its parent
        return self._make_position(node._right)

    def _replace(self, p, e):
        """Replace the element at position p with e, and return old element."""
        node = self._validate(p)
        old = node._element
        node._element = e
        return old

    def _delete(self, p):
        """Delete the node at Position p, and replace it with its child, if any.

        Return the element that had been stored at Position p.
        Raise ValueError if Position p is invalid or p has two children.
        """
        node = self._validate(p)
        if self.num_children(p) == 2:
            raise ValueError('Position has two children')
        child = node._left if node._left is not None else node._right   # might be None
        if child is not None:
            child._parent = node._parent    # child's grandparent becomes parent
        if node is self._root:
            self._root = child              # child becomes root
        else:
            parent = node._parent
            if node is parent._left:
                parent._left = child
            else:
                parent._right = child
        self._size -= 1
        node._parent = node                 # convention for deprecated node
        return node._element

    def _attach(self, p, t1, t2):
        """Attach trees t1 and t2, respectively, as the left and right subtrees of the external Position p.

        As a side effect, set t1 and t2 to empty.
        Raise TypeError if trees t1 and t2 do not match type of this tree.
        Raise ValueError if Position p is invalid or not external.
        """
        node = self._validate(p)
        if not self.is_leaf(p):
            raise ValueError('position must be leaf')
        if not type(self) is type(t1) is type(t2):    # all 3 trees must be same type
            raise TypeError('Tree types must match')
        self._size += len(t1) + len(t2)
        if not t1.is_empty():               # attached t1 as left subtree of node
            t1._root._parent = node
            node._left = t1._root
            t1._root = None                 # set t1 instance to empty
            t1._size = 0
        if not t2.is_empty():               # attached t2 as right subtree of node
            t2._root._parent = node
            node._right = t2._root
            t2._root = None                 # set t2 instance to empty
            t2._size = 0
diff --git a/py/lib/linked_queue.py b/py/lib/linked_queue.py
new file mode 100644
index 0000000..978b35b
--- /dev/null
+++ b/py/lib/linked_queue.py
@@ -0,0 +1,77 @@
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+#from ..exceptions import Empty
+
+#ch7
class Empty(Exception):
    """Error raised when accessing an element of an empty container."""
    pass


class LinkedQueue(object):
    """FIFO queue implementation using a singly linked list for storage."""

    #-------------------------- nested _Node class --------------------------
    class _Node(object):
        """Lightweight, nonpublic class for storing a singly linked node."""
        __slots__ = '_element', '_next'     # streamline memory usage

        def __init__(self, element, next):
            self._element = element
            self._next = next

    #------------------------------- queue methods -------------------------------
    def __init__(self):
        """Create an empty queue."""
        self._head = None
        self._tail = None
        self._size = 0                      # number of queue elements

    def __len__(self):
        """Return the number of elements in the queue."""
        return self._size

    def is_empty(self):
        """Return True if the queue is empty."""
        return self._size == 0

    def first(self):
        """Return (but do not remove) the element at the front of the queue.

        Raise Empty exception if the queue is empty.
        """
        if self.is_empty():
            raise Empty('Queue is empty')
        return self._head._element          # front aligned with head of list

    def dequeue(self):
        """Remove and return the first element of the queue (i.e., FIFO).

        Raise Empty exception if the queue is empty.
        """
        if self.is_empty():
            raise Empty('Queue is empty')
        answer = self._head._element
        self._head = self._head._next
        self._size -= 1
        if self.is_empty():                 # special case as queue is empty
            self._tail = None               # removed head had been the tail
        return answer

    def enqueue(self, e):
        """Add an element to the back of queue."""
        newest = self._Node(e, None)        # node will be new tail node
        if self.is_empty():
            self._head = newest             # special case: previously empty
        else:
            self._tail._next = newest
        self._tail = newest                 # update reference to tail node
        self._size += 1
diff --git a/py/lib/map_base.py b/py/lib/map_base.py
new file mode 100644
index 0000000..a93c1f7
--- /dev/null
+++ b/py/lib/map_base.py
@@ -0,0 +1,38 @@
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
try:
    # The abstract base classes live in collections.abc on Python 3; the
    # old aliases in the collections namespace were removed in Python 3.10.
    from collections.abc import MutableMapping
except ImportError:                         # Python 2
    from collections import MutableMapping

#ch10
class MapBase(MutableMapping):
    """Our own abstract base class that includes a nonpublic _Item class."""

    #------------------------------- nested _Item class -------------------------------
    class _Item(object):
        """Lightweight composite to store key-value pairs as map items.

        Items are compared by key only; the value never takes part in a
        comparison.
        """
        # Deriving from object makes __slots__ effective on Python 2 as well.
        __slots__ = '_key', '_value'

        def __init__(self, k, v):
            self._key = k
            self._value = v

        def __eq__(self, other):
            return self._key == other._key  # compare items based on their keys

        def __ne__(self, other):
            return not (self == other)      # opposite of __eq__

        def __lt__(self, other):
            return self._key < other._key   # compare items based on their keys
diff --git a/py/lib/red_black_tree.py b/py/lib/red_black_tree.py
new file mode 100644
index 0000000..c5905c7
--- /dev/null
+++ b/py/lib/red_black_tree.py
@@ -0,0 +1,112 @@
+# Copyright 2015, Chen Sun
+#
+# Based on source code copyright by 2013, Michael H. Goldwasser
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from lib.binary_search_tree import TreeMap
+
+"""
+usage:
+"""
class RedBlackTreeMap(TreeMap):
    """Sorted map whose underlying search tree is kept balanced as a red-black tree."""

    #-------------------------- nested _Node class --------------------------
    class _Node(TreeMap._Node):
        """Tree node extended with a flag telling whether the node is red."""
        __slots__ = '_red'                  # the only additional data member

        def __init__(self, element, parent=None, left=None, right=None):
            TreeMap._Node.__init__(self, element, parent, left, right)
            self._red = True                # every node starts out red

    #------------------------- positional-based utility methods -------------------------
    # A missing child (None) is treated as black throughout.
    def _set_red(self, p):
        """Color the node at Position p red."""
        p._node._red = True

    def _set_black(self, p):
        """Color the node at Position p black."""
        p._node._red = False

    def _set_color(self, p, make_red):
        """Color the node at Position p red when make_red is true, else black."""
        p._node._red = make_red

    def _is_red(self, p):
        """Tell whether p designates an existing red node."""
        return p is not None and p._node._red

    def _is_red_leaf(self, p):
        """Tell whether p designates a red node that is also a leaf."""
        return self._is_red(p) and self.is_leaf(p)

    def _get_red_child(self, p):
        """Return a red child of p, preferring the left one (None if p has no red child)."""
        left_child = self.left(p)
        if self._is_red(left_child):
            return left_child
        right_child = self.right(p)
        if self._is_red(right_child):
            return right_child
        return None

    #------------------------- support for insertions -------------------------
    def _rebalance_insert(self, p):
        """Restore the red-black properties after inserting the (red) node at p."""
        self._resolve_red(p)

    def _resolve_red(self, p):
        """Remove a possible red-red conflict between p and its parent."""
        if self.is_root(p):
            self._set_black(p)              # the root is always black
            return
        parent = self.parent(p)
        if not self._is_red(parent):
            return                          # no double red, nothing to repair
        uncle = self.sibling(parent)
        if self._is_red(uncle):
            # Case 2: overfull 5-node -- recolor and continue at the grandparent
            grand = self.parent(parent)
            self._set_red(grand)
            self._set_black(self.left(grand))
            self._set_black(self.right(grand))
            self._resolve_red(grand)
        else:
            # Case 1: misshapen 4-node -- trinode restructuring, then recolor
            middle = self._restructure(p)
            self._set_black(middle)
            self._set_red(self.left(middle))
            self._set_red(self.right(middle))

    #------------------------- support for deletions -------------------------
    def _rebalance_delete(self, p):
        """Restore the red-black properties after a child of p was removed."""
        if len(self) == 1:
            self._set_black(self.root())    # a lone remaining node must be black
            return
        if p is None:
            return
        remaining = self.num_children(p)
        if remaining == 1:
            # a black deficit exists unless the surviving child is a red leaf
            survivor = next(self.children(p))
            if not self._is_red_leaf(survivor):
                self._fix_deficit(p, survivor)
        elif remaining == 2:
            # the removed node was black with a red child, now promoted
            left_child = self.left(p)
            if self._is_red_leaf(left_child):
                self._set_black(left_child)
            else:
                self._set_black(self.right(p))

    def _fix_deficit(self, z, y):
        """Resolve black deficit at z, where y is the root of z's heavier subtree."""
        if self._is_red(y):
            # Case 3: y is red -- rotate the misaligned 3-node and try again
            self._rotate(y)
            self._set_black(y)
            self._set_red(z)
            if z == self.right(y):
                heavier = self.left(z)
            else:
                heavier = self.right(z)
            self._fix_deficit(z, heavier)
            return

        # from here on y is black
        x = self._get_red_child(y)
        if x is None:
            # Case 2: y has no red child -- recolor ("fusion")
            self._set_red(y)
            if self._is_red(z):
                self._set_black(z)          # deficit absorbed by z
            elif not self.is_root(z):
                self._fix_deficit(self.parent(z), self.sibling(z))    # propagate upward
            return

        # Case 1: y has a red child x -- restructure ("transfer")
        z_was_red = self._is_red(z)
        middle = self._restructure(x)
        self._set_color(middle, z_was_red)  # new subtree root inherits z's color
        self._set_black(self.left(middle))
        self._set_black(self.right(middle))
diff --git a/py/lib/tree.py b/py/lib/tree.py
new file mode 100644
index 0000000..921ba67
--- /dev/null
+++ b/py/lib/tree.py
@@ -0,0 +1,151 @@
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from lib.linked_queue import LinkedQueue #LinkedQueue is only used for bfs
+import collections
+
+#ch8
class Tree:
    """Abstract base class representing a tree structure."""

    #------------------------------- nested Position class -------------------------------
    class Position:
        """An abstraction representing the location of a single element within a tree.

        Note that two position instances may represent the same inherent location
        in a tree. Therefore, users should always rely on syntax 'p == q' rather
        than 'p is q' when testing equivalence of positions.

        The tree ADT is defined using the concept of a position as an abstraction
        for a node of a tree.
        """

        def element(self):
            """Return the element stored at this Position."""
            raise NotImplementedError('must be implemented by subclass')

        def __eq__(self, other):
            """Return True if other Position represents the same location."""
            raise NotImplementedError('must be implemented by subclass')

        def __ne__(self, other):
            """Return True if other does not represent the same location."""
            return not (self == other)            # opposite of __eq__

    # ---------- abstract methods that concrete subclass must support ----------
    def root(self):
        """Return Position representing the tree's root (or None if empty)."""
        raise NotImplementedError('must be implemented by subclass')

    def parent(self, p):
        """Return Position representing p's parent (or None if p is root)."""
        raise NotImplementedError('must be implemented by subclass')

    def num_children(self, p):
        """Return the number of children that Position p has."""
        raise NotImplementedError('must be implemented by subclass')

    def children(self, p):
        """Generate an iteration of Positions representing p's children."""
        raise NotImplementedError('must be implemented by subclass')

    def __len__(self):
        """Return the total number of elements in the tree."""
        raise NotImplementedError('must be implemented by subclass')

    # ---------- concrete methods implemented in this class ----------
    def is_root(self, p):
        """Return True if Position p represents the root of the tree."""
        return self.root() == p

    def is_leaf(self, p):
        """Return True if Position p does not have any children."""
        return self.num_children(p) == 0

    def is_empty(self):
        """Return True if the tree is empty."""
        return len(self) == 0

    def depth(self, p):
        """Return the number of levels separating Position p from the root."""
        # Walk up iteratively so a very deep tree cannot hit the recursion limit.
        levels = 0
        while not self.is_root(p):
            p = self.parent(p)
            levels += 1
        return levels

    def _height1(self):                 # works, but O(n^2) worst-case time
        """Return the height of the tree."""
        return max(self.depth(p) for p in self.positions() if self.is_leaf(p))

    def _height2(self, p):              # time is linear in size of subtree
        """Return the height of the subtree rooted at Position p."""
        if self.is_leaf(p):
            return 0
        return 1 + max(self._height2(c) for c in self.children(p))

    def height(self, p=None):
        """Return the height of the subtree rooted at Position p.

        If p is None, return the height of the entire tree.
        """
        if p is None:
            p = self.root()
        return self._height2(p)         # start _height2 recursion

    def __iter__(self):
        """Generate an iteration of the tree's elements."""
        for p in self.positions():      # use same order as positions()
            yield p.element()           # but yield each element

    def positions(self):
        """Generate an iteration of the tree's positions."""
        return self.preorder()          # return entire preorder iteration

    def preorder(self):
        """Generate a preorder iteration of positions in the tree."""
        if not self.is_empty():
            for p in self._subtree_preorder(self.root()):   # start recursion
                yield p

    def _subtree_preorder(self, p):
        """Generate a preorder iteration of positions in subtree rooted at p."""
        yield p                                     # visit p before its subtrees
        for c in self.children(p):                  # for each child c
            for other in self._subtree_preorder(c): # do preorder of c's subtree
                yield other                         # yielding each to our caller

    def postorder(self):
        """Generate a postorder iteration of positions in the tree."""
        if not self.is_empty():
            for p in self._subtree_postorder(self.root()):  # start recursion
                yield p

    def _subtree_postorder(self, p):
        """Generate a postorder iteration of positions in subtree rooted at p."""
        for c in self.children(p):                   # for each child c
            for other in self._subtree_postorder(c): # do postorder of c's subtree
                yield other                          # yielding each to our caller
        yield p                                      # visit p after its subtrees

    def breadthfirst(self):
        """Generate a breadth-first iteration of the positions of the tree."""
        if not self.is_empty():
            # collections.deque (module already imported by this file) gives
            # O(1) FIFO operations without relying on the project-local queue.
            fringe = collections.deque([self.root()])   # known positions not yet yielded
            while fringe:
                p = fringe.popleft()                    # remove from front of the queue
                yield p                                 # report this position
                fringe.extend(self.children(p))         # add children to back of queue
diff --git a/py/vcfcompare.py b/py/vcfcompare.py
new file mode 100755
index 0000000..7d23f22
--- /dev/null
+++ b/py/vcfcompare.py
@@ -0,0 +1,1098 @@
+#!/usr/bin/env python
+
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+ Author: Chen Sun(chensun at cse.psu.edu)
+"""
+
import sys

versionError = "You are using an old version of python, please upgrade to python 2.7+\n"
if sys.hexversion < 0x02070000:
    # The bare exit() helper is injected by the site module for interactive
    # use and returns status 0; report on stderr and fail with a non-zero
    # status so calling scripts can detect the problem.
    sys.stderr.write(versionError)
    sys.exit(1)
+#elif sys.hexversion > 0x03000000:
+# print ("python 3")
+
+import subprocess
+import argparse
+import os
+import copy
+from lib.red_black_tree import RedBlackTreeMap
+
+import numpy
+import scipy.cluster.hierarchy as hcluster
+import itertools
+
+# for profile memory usage
+#from memory_profiler import profile
+
# Text shown as the epilog of the --help output.
+ citation = 'About algorithm used in VCF-Compare, please refer to "Method for Cross-Validating Variant Call Set" Section in our paper.'+'\n Please cite our paper.'
+
# Command-line interface. NOTE: the arguments are parsed at import time (see
# parse_args() below), so importing this module outside of a command-line run
# will consume or reject the host program's sys.argv.
+ parser = argparse.ArgumentParser(epilog = citation)
+ parser.add_argument('-r', '--reference', required=True, help = 'reference vcf file path, usually larger than query vcf file')
+ parser.add_argument('-q', '--query', required=True, help = 'query vcf file path')
+ parser.add_argument('-g', '--genome', required=True, help= 'reference genome file path, fasta file format')
+ parser.add_argument('-p', '--false_positive', help='false positive, i.e. mismatch vcf entries in query vcf file, default=false_positive.vcf', default='false_positive.vcf')
+ parser.add_argument('-n', '--false_negative', help='false negative, i.e. mismatch vcf entries in reference vcf file, default=false_negative.vcf', default='false_negative.vcf')
+ #parser.add_argument('-t', '--true_positive', help='true positive bed file position', default='true_positive.bed')
+ parser.add_argument('-o', '--output', help='output matched variants in stage 2 and 3, default=multi_match.out', default='multi_match.out')
+ parser.add_argument('-d', '--direct_search', help='if activate, only perform stage 1, default=not activate', action = 'store_true')
+ parser.add_argument('-c', '--chr', help='chromosome name or id, used for parallel multi genome analysis', default='.')
+ parser.add_argument('-s', '--stat', help='append statistics result into a file, useful for parallel multi genome analysis', default='stat.txt')
+ args = parser.parse_args()  # module-level global read by the search/report functions below
+
+ #match_set = []
+
+ #matched_quality_set = []
+ #refPos_quality = {}
+ ######################### for debug ###########################
+ refPos_vcfEntry = {}  # position -> VCF line text; report() writes these to the false-negative file
+ quePos_vcfEntry = {}  # position -> VCF line text; report() writes these to the false-positive file
+#ref_match_total = set()
+#que_match_total = set()
+
def direct_search(refPos_snp, quePos_snp):
    """Remove variants that are identical in the reference and query maps.

    A query key matches when the same key exists in ``refPos_snp`` and both
    maps hold an equal value for it. Every match is logged to
    ``direct_search.txt`` in the current directory (the file is overwritten
    on each call) as ``key,ref_value<TAB>key,que_value`` and is then removed
    from both maps.

    Both mappings are modified in place; nothing is returned.
    """
    # Collect the matches first so neither map is mutated while being iterated.
    matched_keys = [key for key in quePos_snp
                    if key in refPos_snp and refPos_snp[key] == quePos_snp[key]]

    # The context manager guarantees the log is closed even if a write fails.
    with open('direct_search.txt', 'w') as match_file:
        for key in matched_keys:
            match_string = str(key) + ',' + str(refPos_snp[key]) + '\t' + str(key) + ',' + str(quePos_snp[key]) + '\n'
            match_file.write(match_string)
            refPos_snp.pop(key, None)
            quePos_snp.pop(key, None)
+
+ #with open('matched_quality.txt', 'w') as quality:
+ # for q in matched_quality_set:
+ # quality.write(str(q)+'\n')
+
+ #print ("direct search found:", num)
+
+
def modify_sequence(sequence, pos, snpSet):
    """Return ``sequence`` with a single variant applied at index ``pos``.

    ``snpSet`` is indexable; element 1 is the reference allele and element 2
    the alternative allele (element 0 is not used here). The ``len(ref)``
    characters starting at ``pos`` are replaced by ``alt``. The reference
    allele is not checked against the sequence content.

    A message is printed when ``snpSet`` does not have exactly three
    elements, but processing still continues.
    """
    if len(snpSet) != 3:
        print ("Error: snp set size not right.")
    ref = snpSet[1]
    alt = snpSet[2]
    # The former "compare ref against the sequence, then pass" branch was a
    # no-op that only built two throw-away upper-cased strings; it is removed.
    return sequence[:pos] + alt + sequence[pos + len(ref):]
+
+ def near_search(refPos_snp, quePos_snp, genome, blockSize):
# For each query variant, look up the nearest reference variant and test
# whether applying either one to the same genome window yields the same
# sequence (case-insensitive); matches are written to args.output.
# refPos_snp must provide find_nearest(key) returning an object with key()
# and value(); quePos_snp is iterated like a mapping.
# NOTE(review): indentation was lost in this archived copy, so the nesting of
# the trailing `break` (inside the match branch vs. directly in the loop body)
# cannot be confirmed from here.
+
+ queRemoveList = [] # record quePos that should be deleted
+ genomeLen = len(genome) # record genome length
+ output = open(args.output, 'a') #open output file for
+ if refPos_snp is None:
+ print ("Error: refPos_snp is None")
+ if quePos_snp is None:
+ print ("Error: quePos_snp is None")
+
+ num = 0
+ for key in quePos_snp:
+ num += 1
+ ref_element = refPos_snp.find_nearest(key) # return a position
+ ref_snp = ref_element.value()
+ que_snp = quePos_snp[key]
+ refPos = ref_element.key()
+ quePos = key
+
+ if abs(refPos-key) > blockSize:  # nearest reference variant is too far away
+ continue
+ if ref_snp[0] != que_snp[0]:  # element 0 of the two variants must agree
+ continue
+
+ #get the substring
+ seqStart = min(key, refPos)-100  # pad the window by 100 bases on each side
+ if seqStart < 0:
+ seqStart = 0
+ seqEnd = max(key, refPos) + 100
+ if seqEnd > genomeLen-1:
+ seqEnd = genomeLen-1
+ subSequence = genome[seqStart:seqEnd+1]
+ refIndex = refPos-seqStart  # variant offsets relative to the window
+ queIndex = quePos-seqStart
+
+ #modify string and then compare
+ refSequence = modify_sequence(subSequence, refIndex, ref_snp)
+ queSequence = modify_sequence(subSequence, queIndex, que_snp)
+ if refSequence.upper() == queSequence.upper():
+ queRemoveList.append(quePos)
+ ref_variants = '{},{},{}'.format(refPos, ref_snp[1], ref_snp[2])
+ query_variants = '{},{},{}'.format(quePos, que_snp[1], que_snp[2])
+ output_info = '{},{}'.format(subSequence, refSequence.upper())  # built but never written
+ match_string = '.\t{}\t{}\t.\n'.format(ref_variants, query_variants)
+ output.write(match_string)
+ refPos_snp.pop(refPos, None)
+ break
+
+ output.close()
+ for pos in queRemoveList:
# NOTE(review): `match_set` is not defined in the visible part of this file
# (its module-level definition is commented out), so this line would raise
# NameError as soon as a match exists.
+ match_set.append(pos)
+ quePos_snp.pop(pos, None)
+
def powerful_near_search(refPos_snp, quePos_snp, genome, blockSize):
    """Match each query variant against every reference variant nearby.

    For every key in ``quePos_snp`` the reference variants returned by
    ``refPos_snp.find_range(key - blockSize, key + blockSize)`` (bounds
    clamped to the genome) are tried in order. A pair matches when element 0
    of both variants agrees and applying either variant to the same genome
    window (padded by 100 bases on each side) gives the same sequence,
    ignoring case. The first matching reference variant is removed from
    ``refPos_snp`` at once; matched query variants are removed from
    ``quePos_snp`` after the scan.

    Both maps are modified in place; nothing is returned.
    """
    genomeLen = len(genome)
    # The original opened args.output for appending and closed it again
    # without writing (its write call is disabled); keep that side effect,
    # which creates the file if it does not exist yet.
    open(args.output, 'a').close()
    if refPos_snp is None:
        print ("Error: refPos_snp is None")
    if quePos_snp is None:
        print ("Error: quePos_snp is None")

    queRemoveList = []                      # query positions to delete afterwards
    for quePos in quePos_snp:
        que_snp = quePos_snp[quePos]
        minPos = max(quePos - blockSize, 0)
        maxPos = min(quePos + blockSize, genomeLen - 1)
        for (refPos, ref_snp) in refPos_snp.find_range(minPos, maxPos):
            if ref_snp[0] != que_snp[0]:
                continue

            # genome window around both variants
            seqStart = max(min(quePos, refPos) - 100, 0)
            seqEnd = min(max(quePos, refPos) + 100, genomeLen - 1)
            subSequence = genome[seqStart:seqEnd + 1]

            # apply each variant on its own and compare the results
            refSequence = modify_sequence(subSequence, refPos - seqStart, ref_snp)
            queSequence = modify_sequence(subSequence, quePos - seqStart, que_snp)
            if refSequence.upper() == queSequence.upper():
                queRemoveList.append(quePos)
                refPos_snp.pop(refPos, None)
                break                       # stop scanning: refPos_snp was just modified

    # `match_set` is not defined anywhere in this module (its definition is
    # commented out), so the former `match_set.append(pos)` raised NameError
    # on the first match; it is dropped, as in the other search stages.
    for pos in queRemoveList:
        quePos_snp.pop(pos, None)
+
def modify_by_list(pos_snp, posList, sequence, bound):
    """Return ``sequence`` with every variant listed in ``posList`` applied.

    ``pos_snp`` maps a position to an indexable variant whose element 1 is
    the reference allele and element 2 the alternative allele. ``bound`` is
    the position that corresponds to index 0 of ``sequence``, so a variant at
    ``pos`` is applied at index ``pos - bound``.

    ``posList`` itself is left untouched. A message is printed for a variant
    that does not have exactly three elements, but processing continues.
    """
    # Apply from the highest position down so that a length-changing edit
    # never shifts the index of a variant that is still to be applied.
    # sorted() already returns a new list, so no deep copy is required.
    for pos in sorted(posList, reverse=True):
        snp = pos_snp[pos]
        if len(snp) != 3:
            print ("Error: snp set size not right.")
        index = pos - bound
        ref = snp[1]
        alt = snp[2]
        sequence = sequence[:index] + alt + sequence[index + len(ref):]
    return sequence
+
+ def complex_search(refPos_snp, quePos_snp, genome, rev):
# Match ONE variant of quePos_snp against a GROUP of variants of refPos_snp.
# For each query variant the reference variants found by linear_range_search
# between the query position and the end of its ref allele are collected,
# extended to the left by earlier reference variants whose ref allele reaches
# the query position, and all applied to a genome window padded by 100 bases.
# If the result equals the window with only the query variant applied
# (case-insensitive), the group is removed from refPos_snp and the query
# variant is queued for removal from quePos_snp.
# `rev` only swaps which side is labelled "ref" and "query" in the report
# strings; those strings are currently not written (output.write is disabled).
# Both maps must offer the tree API used below (find_range,
# linear_range_search, before, after, pop).
# NOTE(review): indentation was lost in this archived copy; the nesting is
# not recoverable from the text alone.
+ #global ref_match_total
+ #global que_match_total
+ queRemoveList = [] # record quePos that should be deleted
+ genomeLen = len(genome) # record genome length
+ output = open(args.output, 'a+') #open output file for
+ if refPos_snp is None:
+ print ("Error: refPos_snp is None")
+ if quePos_snp is None:
+ print ("Error: quePos_snp is None")
+ num = 0
+
+ start_position = None  # hint passed to linear_range_search for the next scan
+ for (key, value) in quePos_snp.find_range(None, None):
+ num += 1
+ que_snp = value
+ quePos = key
+ minPos = key
+ maxPos = min(key+len(que_snp[1])-1, genomeLen-1) + 1
+ candidateRefPos = []
+ candidateRefNode = []
+ temp_refPos_snp = {}
+ min_refPos = 3000000000  # sentinel larger than any position expected here
+ max_refPos = 0
+ for p in refPos_snp.linear_range_search(start_position, minPos, maxPos):
+ k = p.key()
+ v = p.value()
+ if min_refPos > k:
+ min_refPos = k
+ if max_refPos < k:
+ max_refPos = k
+ candidateRefNode.append(p)
+ candidateRefPos.append(k)
+ temp_refPos_snp[k] = v
+ #get the substring
+ if len(candidateRefPos) == 0:
+ continue
+
# Pull in preceding reference variants whose ref allele still reaches minPos.
+ before = refPos_snp.before(candidateRefNode[0])
+ while before is not None and before.key() + len(before.value()[1]) - 1 >= minPos:
+ #print ('find before boundary in stage 2')
+ candidateRefNode.insert(0, before)
+ min_refPos = before.key()
+ candidateRefPos.append(before.key())
+ temp_refPos_snp[before.key()] = before.value()
+ before = refPos_snp.before(candidateRefNode[0])
+
+ candidateRefPos.sort()  # earlier positions were appended at the end above
+ seqStart = min(key, min_refPos)-100
+ if seqStart < 0:
+ seqStart = 0
+ seqEnd = max(key, max_refPos) + 100
+ if seqEnd > genomeLen-1:
+ seqEnd = genomeLen-1
+ subSequence = genome[seqStart:seqEnd+1]
+ queIndex = quePos-seqStart
+
+ #modify string and then compare
+ refSequence = modify_by_list(temp_refPos_snp, candidateRefPos, subSequence, seqStart)
+ queSequence = modify_sequence(subSequence, queIndex, que_snp)
+
+ if refSequence.upper() == queSequence.upper():
+ #matched
+ start_position = refPos_snp.after(candidateRefNode[-1])  # resume after the consumed group
+ queRemoveList.append(quePos)
+ ref_variants = ''
+ query_variants = ''
+ if not rev:
+ for index in range(len(candidateRefPos)-1):
+ pos = candidateRefPos[index]
+ ref_variants += '{},{},{};'.format(pos, temp_refPos_snp[pos][1], temp_refPos_snp[pos][2])
+ #ref_match_total.add(pos)
+ #be sure to recover
+ refPos_snp.pop(pos)
+ ref_pos = candidateRefPos[-1]
+ ref_variants += '{},{},{}'.format(ref_pos, temp_refPos_snp[ref_pos][1], temp_refPos_snp[ref_pos][2])
+
+ #ref_match_total.add(ref_pos)
+ # be sure to recover
+ refPos_snp.pop(ref_pos)
+
+ #multi_match_ref += 1
+ query_variants = '{},{},{}'.format(quePos, que_snp[1], que_snp[2])
+ else:
+ for index in range(len(candidateRefPos)-1):
+ pos = candidateRefPos[index]
+ query_variants += '{},{},{};'.format(pos, temp_refPos_snp[pos][1], temp_refPos_snp[pos][2])
+ #que_match_total.add(pos)
+ refPos_snp.pop(pos)
+ ref_pos = candidateRefPos[-1]
+ query_variants += '{},{},{}'.format(ref_pos, temp_refPos_snp[ref_pos][1], temp_refPos_snp[ref_pos][2])
+ #que_match_total.add(ref_pos)
+ refPos_snp.pop(ref_pos)
+ #multi_match_ref += 1
+ ref_variants = '{},{},{}'.format(quePos, que_snp[1], que_snp[2])
+ output_info = '{},{}'.format(subSequence, refSequence.upper())
+ match_string = '.\t{}\t{}\t.\n'.format(ref_variants, query_variants)
+ #output.write(match_string)
+ else:
+ start_position = candidateRefNode[0]  # no match: next scan restarts at this group's first node
+ output.close()
+ for pos in queRemoveList:
+ #match_set.append(pos)
+ #que_match_total.add(pos)
+ quePos_snp.pop(pos, None)
+
def convert_substitution(pos_list, pos_snp, subsequence, low_bound):
    """Placeholder for an unfinished conversion of variants into indel records.

    The function only prints a notice and returns None; none of its
    arguments are read or modified.

    The abandoned draft that used to follow the early ``return`` could never
    run and has been removed. Its outline, for whoever completes it: visit
    the positions in sorted order, make each one relative to ``low_bound``,
    and emit ``[position, -1 or +1, base]`` records -- a deletion plus an
    insertion for a single-base substitution, one record per removed base
    for a deletion, and one record per added base for an insertion. The draft
    called ``list.insert`` with a single argument (which raises TypeError;
    ``append`` was presumably intended) and never returned its result.
    """
    print ('convert_substitution: unfinished function')
    return
+
+
+ def check_transition_theory(candidateRefPos, candidateQuePos, temp_refPos_snp, temp_quePos_snp, subSequence, lowBound):
# Forwards only the reference-side arguments to convert_substitution();
# candidateQuePos and temp_quePos_snp are accepted but not used, and nothing
# is returned.
+ convert_substitution(candidateRefPos, temp_refPos_snp, subSequence, lowBound)
+
+ def multi_search(refPos_snp, quePos_snp, genome, blockSize):
# Match GROUPS of reference variants against GROUPS of query variants.
# For each query position still present, all variants of both maps that
# linear_range_search returns for [key - blockSize, key + blockSize + 1]
# (clamped to the genome) are applied to the same genome window, padded by
# 100 bases; if both results are equal (case-insensitive) every variant of
# both groups is removed from its map.
# Both maps must offer the tree API used below (keys, linear_range_search,
# after, pop).
# The report string and the counters are built but not emitted: the
# output.write and print calls are disabled.
# NOTE(review): indentation was lost in this archived copy; the nesting is
# not recoverable from the text alone.
+ #global ref_match_total
+ #global que_match_total
+ multi_match = 0
+ multi_match_ref = 0
+ multi_match_que = 0
+ one2multi = 0
+ multi2multi = 0
+
+ genomeLen = len(genome)
+ output = open(args.output, 'a+') #open output file for
+ refPosDelSet = set()
+ quePosDelSet = set()
+
+
+ #debug = False
+ ref_start_position = None  # hints passed to linear_range_search for the next scan
+ que_start_position = None
# NOTE(review): slicing the result of keys() needs keys() to return a list
# (Python 2 mapping behaviour); a Python 3 keys view cannot be sliced.
# The snapshot allows quePos_snp to be modified inside the loop.
+ for key in quePos_snp.keys()[:]:
+
+ if not key in quePos_snp: # logN operation
+ continue
+
+ candidateRefPos = []
+ candidateQuePos = []
+ candidateRefNode = []
+ candidateQueNode = []
+ minPos = max(key-blockSize, 0)
+ maxPos = min(key+blockSize, genomeLen-1) + 1
+
+ temp_refPos_snp = {}
+ for p in refPos_snp.linear_range_search(ref_start_position, minPos, maxPos):
+ k = p.key()
+ v = p.value()
+ candidateRefNode.append(p)
+ candidateRefPos.append(k)
+ temp_refPos_snp[k] = v
+
+ temp_quePos_snp = {}
+ for p in quePos_snp.linear_range_search(que_start_position, minPos, maxPos):
+ k = p.key()
+ v = p.value()
+ candidateQueNode.append(p)
+ candidateQuePos.append(k)
+ temp_quePos_snp[k] = v
+
+ if len(candidateQuePos) == 0:
+ print ("Error: query empty")
+ continue
+
+ if len(candidateRefPos) == 0:
+ continue
# The four values below are only read by the disabled boundary-extension
# code kept in the string literal that follows.
+ min_ref_pos = candidateRefPos[0]
+ max_ref_pos = candidateRefPos[-1] + len(temp_refPos_snp[candidateRefPos[-1]][1]) - 1
+
+ min_que_pos = candidateQuePos[0]
+ max_que_pos = candidateQuePos[-1] + len(temp_quePos_snp[candidateQuePos[-1]][1]) - 1
+
+ """
+ ref_before = refPos_snp.before(candidateRefNode[0])
+ que_before = quePos_snp.before(candidateQueNode[0])
+ while (ref_before is not None and ref_before.key() + len(ref_before.value()[0]) - 1 >= min_que_pos) or (que_before is not None and que_before.key() + len(que_before.value()[0])-1 > min_ref_pos):
+ #print ('find before boundary in stage 3')
+ if ref_before is not None and ref_before.key() + len(ref_before.value()[0]) - 1 >= min_que_pos :
+ candidateRefNode.insert(0, ref_before)
+ min_ref_pos = ref_before.key()
+ candidateRefPos.insert(0, ref_before.key())
+ temp_refPos_snp[ref_before.key()] = ref_before.value()
+ ref_before = refPos_snp.before(candidateRefNode[0])
+
+ if que_before is not None and que_before.key() + len(que_before.value()[0]) - 1 >= min_ref_pos :
+ candidateQueNode.insert(0, que_before)
+ min_que_pos = que_before.key()
+ candidateQuePos.insert(0, que_before.key())
+ temp_quePos_snp[que_before.key()] = que_before.value()
+ que_before = quePos_snp.before(candidateQueNode[0])
+
+ ref_after = refPos_snp.after(candidateRefNode[-1])
+ que_after = quePos_snp.after(candidateQueNode[-1])
+ while (ref_after is not None and ref_after.key() <= max_que_pos) or (que_after is not None and que_after.key() <= max_ref_pos):
+ #print ('find after boundary in stage 3')
+ if ref_after is not None and ref_after.key() <= max_que_pos :
+ candidateRefNode.append(ref_after)
+ max_ref_pos = ref_after.key() + len(ref_after.value()[1]) - 1
+ candidateRefPos.append(ref_after.key())
+ temp_refPos_snp[ref_after.key()] = ref_after.value()
+ ref_after = refPos_snp.after(candidateRefNode[-1])
+
+ if que_after is not None and que_after.key() <= max_ref_pos :
+ candidateQueNode.append(que_after)
+ max_que_pos = que_after.key() + len(que_after.value()[1]) - 1
+ candidateQuePos.append(que_after.key())
+ temp_quePos_snp[que_after.key()] = que_after.value()
+ que_after = quePos_snp.after(candidateQueNode[-1])
+
+ """
# Genome window: from the smallest to the largest candidate position of
# either side, padded by 100 bases and clamped to the genome.
+ lowBound = candidateRefPos[0]
+ upperBound = candidateRefPos[-1]
+
+
+ if lowBound > candidateQuePos[0]:
+ lowBound = candidateQuePos[0]
+ if upperBound < candidateQuePos[-1]:
+ upperBound = candidateQuePos[-1]
+
+ lowBound = max(0, lowBound-100)
+ upperBound = min(upperBound+100, genomeLen-1)
+
+ subSequence = genome[lowBound: upperBound+1]
+
+ refSequence = modify_by_list(temp_refPos_snp, candidateRefPos, subSequence, lowBound)
+ queSequence = modify_by_list(temp_quePos_snp, candidateQuePos, subSequence, lowBound)
+
+ if refSequence.upper() == queSequence.upper():
+ #print ("multi_search works")
+ ref_start_position = refPos_snp.after(candidateRefNode[-1])  # resume after the consumed groups
+ que_start_position = quePos_snp.after(candidateQueNode[-1])
+ ref_variants = ''
+ query_variants = ''
+ #check_transition_theory(candidateRefPos, candidateQuePos, temp_refPos_snp, temp_quePos_snp, subSequence, lowBound)
+ for index in range(len(candidateRefPos)-1):
+ pos = candidateRefPos[index]
+ ref_variants += '{},{},{};'.format(pos, temp_refPos_snp[pos][1], temp_refPos_snp[pos][2])
+ #ref_match_total.add(pos)
+ refPos_snp.pop(pos)
+ multi_match_ref += 1
+ ref_pos = candidateRefPos[-1]
+ ref_variants += '{},{},{}'.format(ref_pos, temp_refPos_snp[ref_pos][1], temp_refPos_snp[ref_pos][2])
+ #ref_match_total.add(ref_pos)
+ refPos_snp.pop(ref_pos)
+ multi_match_ref += 1
+
+ for index in range(len(candidateQuePos)-1):
+ pos = candidateQuePos[index]
+ query_variants += '{},{},{};'.format(pos, temp_quePos_snp[pos][1], temp_quePos_snp[pos][2])
+ #que_match_total.add(pos)
+ quePos_snp.pop(pos)
+ #quePosList.remove(pos)
+ #match_set.append(pos)
+ multi_match_que += 1
+ #quePosDelSet.add(pos)
+ que_pos = candidateQuePos[-1]
+ query_variants += '{},{},{}'.format(que_pos, temp_quePos_snp[que_pos][1], temp_quePos_snp[que_pos][2])
+ #que_match_total.add(que_pos)
+ quePos_snp.pop(que_pos)
+ #quePosList.remove(que_pos)
+ #match_set.append(que_pos)
+ multi_match_que += 1
+ #quePosDelSet.add(que_pos)
+
+ output_info = '{},{},{},{},{}'.format(blockSize, lowBound, upperBound+1, subSequence, refSequence.upper())
+ match_string = '.\t{}\t{}\t{}\n'.format(ref_variants, query_variants, output_info)
+ #output.write(match_string)
+ multi_match += 1
+
+ if len(candidateRefPos) == 1 or len(candidateQuePos) == 1:
+ one2multi += 1
+ else:
+ multi2multi += 1
+ else:
+ ref_start_position = candidateRefNode[0]  # no match: next scan restarts at the first candidates
+ que_start_position = candidateQueNode[0]
+
+ output.close()
+ #print (multi_match, multi_match_ref, multi_match_que, one2multi, multi2multi)
+
def match_by_tuple(ref_choice, que_choice, temp_refPos_snp, temp_quePos_snp, sequence):
    """Tell whether two chosen variant sets produce the same local sequence.

    ``ref_choice`` and ``que_choice`` are non-empty iterables of positions
    that are keys of ``temp_refPos_snp`` and ``temp_quePos_snp``. Both sets
    are applied, independently, to the slice of ``sequence`` spanning all
    chosen positions plus 100 characters on each side (clamped to the
    sequence). Returns True when the two results are equal, ignoring case.
    """
    ref_positions = sorted(ref_choice)
    que_positions = sorted(que_choice)

    # window covering every chosen position, padded and clamped
    window_start = max(0, min(ref_positions[0], que_positions[0]) - 100)
    window_end = min(len(sequence) - 1, max(ref_positions[-1], que_positions[-1]) + 100)
    window = sequence[window_start: window_end + 1]

    ref_result = modify_by_list(temp_refPos_snp, ref_positions, window, window_start)
    que_result = modify_by_list(temp_quePos_snp, que_positions, window, window_start)
    return ref_result.upper() == que_result.upper()
+
+def cluster_search_old(refPos_snp, quePos_snp, data_list, cluster_list, data_list_ref_que_dict, sequence):
+ # refPos_snp and quePos_snp are red_black_tree_map which operates like dictionary expect keys are sorted.
+
+ # all index less than data_list_index_threshold is in refPos_snp
+ # otherwise, it is in quePos_snp
+ print 'cluster search'
+
+ cluster_pos = {}
+ for index in range(len(cluster_list)):
+ cluster_id = cluster_list[index]
+ pos = data_list[index]
+ if cluster_id in cluster_pos:
+ cluster_pos[cluster_id].append(pos)
+ else:
+ cluster_pos[cluster_id] = [pos]
+
+ print 'iterate clusters'
+
+ for cluster_id in cluster_pos:
+ pos_list = cluster_pos[cluster_id]
+ if len(pos_list) <= 2:
+ continue
+ candidateRefPos = []
+ candidateQuePos = []
+ temp_refPos_snp = {}
+ temp_quePos_snp = {}
+ min_pos = len(sequence) - 1
+ max_pos = 0
+
+ for temp_pos in pos_list:
+ if data_list_ref_que_dict[temp_pos] > 0:
+ candidateRefPos.append(temp_pos)
+ temp_refPos_snp[temp_pos] = refPos_snp[temp_pos]
+ else:
+ candidateQuePos.append(temp_pos)
+ temp_quePos_snp[temp_pos] = quePos_snp[temp_pos]
+
+ if len(candidateRefPos) <= 1 or len(candidateQuePos) <= 1:
+ continue
+ # now we have the candidateRefPos and candidateQuePos
+ # next step is to permutate all combinations
+ # rule is that at least one should from each list
+ is_matched = False
+ for i in range(1, len(candidateRefPos), 1):
+ if is_matched:
+ break
+ ref_combination_list = list(itertools.combinations(candidateRefPos, i))
+ for j in range(1, len(candidateQuePos), 1):
+ if is_matched:
+ break
+ que_combination_list = list(itertools.combinations(candidateQuePos, j))
+
+ #print ref_combination_list
+ #print que_combination_list
+
+ for ref_choice in ref_combination_list:
+ if is_matched:
+ break
+ for que_choice in que_combination_list:
+ is_matched = match_by_tuple(ref_choice, que_choice, temp_refPos_snp, temp_quePos_snp, sequence)
+ if is_matched:
+ for pos in ref_choice:
+ refPos_snp.pop(pos)
+ for pos in que_choice:
+ quePos_snp.pop(pos)
+ break
+# ----------------------------end of cluster_search_old------------------------------------------------
+
+
+
def _format_and_remove(pos_snp, positions):
    """Pop each position from pos_snp and return 'pos,ref,alt' items joined by ';'."""
    parts = []
    for pos in positions:
        parts.append('{},{},{}'.format(pos, pos_snp[pos][1], pos_snp[pos][2]))
        pos_snp.pop(pos)
    return ';'.join(parts)


def cluster_search(refPos_snp, quePos_snp, data_list, cluster_list, sequence):
    """Match subsets of reference and query variants inside each cluster.

    cluster_search uses a hash table to reduce running time from 2^(mn) to
    2^m + 2^n: every non-empty subset of the cluster's reference variants is
    applied to the local sequence and stored by its result, then every
    non-empty subset of the query variants is looked up in that table.

    ``data_list[i]`` is a pair ``(position, flag)`` and ``cluster_list[i]``
    the id of its cluster; ``flag > 0`` marks a key of ``refPos_snp``,
    anything else a key of ``quePos_snp``. Both maps behave like dictionaries
    except that their keys are kept sorted. The window of each cluster is
    taken from the position of its first and last entry, so entries are
    assumed to arrive in position order -- TODO confirm with the caller.

    Every variant taking part in at least one matching subset is removed from
    its map, and one line per cluster with matches is appended to
    ``args.output``.
    """
    # Parenthesised form works both as a Python 2 statement and a Python 3
    # call, like every other print in this file.
    print ('cluster search')

    # The context manager closes the report even if a lookup below fails.
    with open(args.output, 'a') as output:
        cluster_data = {}
        for index in range(len(cluster_list)):
            cluster_data.setdefault(cluster_list[index], []).append(data_list[index])

        print ('iterate clusters')
        for cluster_id in sorted(cluster_data):
            members = cluster_data[cluster_id]
            if len(members) <= 2:
                continue

            # local window: cluster span padded by 100, clamped to the sequence
            min_pos = max(0, members[0][0] - 100)
            max_pos = min(members[-1][0] + 100, len(sequence) - 1)
            sub_sequence = sequence[min_pos: max_pos + 1]

            candidateRefPos = []
            candidateQuePos = []
            temp_refPos_snp = {}
            temp_quePos_snp = {}
            for temp_data in members:
                temp_pos = temp_data[0]
                if temp_data[1] > 0:
                    candidateRefPos.append(temp_pos)
                    temp_refPos_snp[temp_pos] = refPos_snp[temp_pos]
                else:
                    candidateQuePos.append(temp_pos)
                    temp_quePos_snp[temp_pos] = quePos_snp[temp_pos]

            if len(candidateRefPos) <= 1 and len(candidateQuePos) <= 1:
                continue

            candidateRefPos.sort()
            candidateQuePos.sort()

            # resulting sequence -> reference subset producing it (a later
            # subset with the same result replaces an earlier one)
            ref_sequence_choice = {}
            for i in range(1, len(candidateRefPos) + 1):
                for ref_combination in itertools.combinations(candidateRefPos, i):
                    ref_choice_list = list(ref_combination)
                    ref_sequence = modify_by_list(temp_refPos_snp, ref_choice_list, sub_sequence, min_pos)
                    ref_sequence_choice[ref_sequence.upper()] = ref_choice_list

            ref_pos_del_set = set()
            que_pos_del_set = set()
            for j in range(1, len(candidateQuePos) + 1):
                for que_combination in itertools.combinations(candidateQuePos, j):
                    que_choice_list = list(que_combination)
                    que_sequence = modify_by_list(temp_quePos_snp, que_choice_list, sub_sequence, min_pos)
                    que_sequence = que_sequence.upper()
                    if que_sequence in ref_sequence_choice:
                        ref_pos_del_set.update(ref_sequence_choice[que_sequence])
                        que_pos_del_set.update(que_choice_list)

            #===============================output matching results========================================
            if len(ref_pos_del_set) == 0 or len(que_pos_del_set) == 0:
                continue

            ref_variants = _format_and_remove(refPos_snp, list(ref_pos_del_set))
            query_variants = _format_and_remove(quePos_snp, list(que_pos_del_set))

            output_info = '{},{},{}'.format(min_pos, max_pos, sub_sequence)
            match_string = '.\t{}\t{}\t{}\n'.format(ref_variants, query_variants, output_info)
            output.write(match_string)
+
def report(refPos_snp, quePos_snp, refOriginalNum, queOriginalNum):
    """Write the unmatched variants and the summary statistics.

    Whatever is still in ``refPos_snp`` is treated as unmatched reference
    variants and appended, in key order, to ``args.false_negative``; whatever
    is still in ``quePos_snp`` is appended to ``args.false_positive``. The
    text of each line comes from the module-level ``refPos_vcfEntry`` /
    ``quePos_vcfEntry`` maps. A summary is printed, and one tab-separated
    line (chromosome, both original totals, both match counts) is appended to
    ``args.stat``.

    ``refOriginalNum`` / ``queOriginalNum`` are the variant counts before
    matching; match counts are derived as original minus remaining.
    """
    # Context managers close each file even if a position is missing from the
    # entry maps; the files were previously closed by hand.
    with open(args.false_negative, "a+") as negativeFile:
        for pos in sorted(refPos_snp.keys()):
            negativeFile.write(refPos_vcfEntry[pos] + '\n')

    with open(args.false_positive, "a+") as positiveFile:
        for pos in sorted(quePos_snp.keys()):
            positiveFile.write(quePos_vcfEntry[pos] + '\n')

    ref_matches = refOriginalNum - len(refPos_snp)
    que_matches = queOriginalNum - len(quePos_snp)

    print ('\n######### Matching Result ################\n')
    print (' ref total: {}\n que total: {}\n ref matches: {}\n que '\
        'matches: {}\n ref mismatch: {}\n alt mismatch: {}\n'.format(\
        refOriginalNum, queOriginalNum, ref_matches,\
        que_matches, len(refPos_snp), len(quePos_snp)))

    with open(args.stat, 'a+') as stat_file:
        stat_file.write('{}\t{}\t{}\t{}\t{}\n'.format(args.chr, refOriginalNum,\
            queOriginalNum, ref_matches, \
            que_matches))
+
def check_tandem_repeat(sequence):
    """Return True if ``sequence`` is built from a repeated leading unit.

    Every unit length from 1 up to half the sequence length is tried.  For a
    given length the leading unit is compared against each following
    *complete* unit; a trailing fragment shorter than the unit is not
    compared, so it never prevents a match.  Sequences shorter than two
    characters are never reported as repeats.
    """
    sequence_length = len(sequence)
    # Floor division keeps the bound an int on Python 3 as well; with "/" the
    # bound becomes a float there and range() raises TypeError.
    for repeat_length in range(1, sequence_length // 2 + 1):
        repeat_region = sequence[:repeat_length]
        # Start offsets of every complete unit after the first one.
        unit_starts = range(repeat_length,
                            sequence_length - repeat_length + 1,
                            repeat_length)
        if all(sequence[start:start + repeat_length] == repeat_region
               for start in unit_starts):
            return True
    return False
+
+
def clustering_snp(data_list, cluster_list, threshold, reference, lower_bound, refPos_snp, quePos_snp):
    """Cluster one-dimensional variant positions by the gap between them.

    The data is one-dimensional, so a single left-to-right pass over the
    sorted positions is enough.  A new cluster is started when the gap to
    the end of the previously seen variants is larger than ``threshold``.
    When the gap lies between ``lower_bound`` and ``threshold`` a new
    cluster is started only if the reference sequence spanning the gap is
    not a tandem repeat (see check_tandem_repeat).

    data_list -- sorted entries ``[position, source]``; ``source > 0`` means
        the position is looked up in ``refPos_snp``, otherwise in
        ``quePos_snp``.
    cluster_list -- output list; one cluster index is appended per entry.
    reference -- reference sequence the gaps are sliced from.
    """
    if not data_list:
        return

    cluster_id = 0
    covered_end = 0  # furthest end (position + REF length) seen so far

    for entry in data_list:
        position = entry[0]
        gap = position - covered_end

        if gap > threshold:
            cluster_id += 1
        elif gap > lower_bound and not check_tandem_repeat(reference[covered_end:position]):
            cluster_id += 1
        cluster_list.append(cluster_id)

        snp_table = refPos_snp if entry[1] > 0 else quePos_snp
        variant_end = position + len(snp_table[position][1])
        covered_end = max(covered_end, variant_end)
+
def _read_genome(genome_path):
    """Return the sequence lines of the FASTA file joined into one string.

    Lines starting with ">" are skipped, so all records are concatenated.
    """
    pieces = []
    with open(genome_path) as seq_file:
        # Stream the file and join once instead of readlines() plus
        # repeated string concatenation.
        for line in seq_file:
            if line.startswith(">"):
                continue
            pieces.append(line.strip())
    return ''.join(pieces)


def _load_vcf(vcf_path, mismatch_path, pos_vcfEntry):
    """Parse a VCF file into ``{0-based position: [type, ref, alt]}``.

    ``mismatch_path`` is truncated and receives every "#" header line of
    the VCF.  The stripped text of each accepted record is stored in
    ``pos_vcfEntry`` under its position.  Records with several ALT alleles
    (a "," in ALT) and blank lines are skipped.  The type is 'D' when REF is
    longer than ALT, 'I' when it is shorter and 'S' otherwise.
    """
    pos_snp = {}
    with open(mismatch_path, 'w') as mismatch_file, open(vcf_path) as vcf_file:
        for line in vcf_file:
            if line.startswith("#"):
                mismatch_file.write(line)
                continue
            line = line.strip()
            if not line:
                continue
            columns = line.split("\t")
            pos = int(columns[1]) - 1
            ref = columns[3]
            alt = columns[4]
            if ',' in alt:
                continue
            if len(ref) > len(alt):
                snpType = 'D'
            elif len(ref) < len(alt):
                snpType = 'I'
            else:
                snpType = 'S'
            pos_snp[pos] = [snpType, ref, alt]
            pos_vcfEntry[pos] = line
    return pos_snp


def main():
    """Run the matching pipeline on the files given on the command line.

    Stage 1 removes variants that are identical in both sets
    (direct_search); with ``--direct_search`` the run stops there.  Stage 2
    runs complex_search in both directions.  The remaining variants are
    then clustered by position and handed to cluster_search.  The unmatched
    variants and the statistics are written by report().
    """
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit()

    if not os.path.isfile(args.reference):
        print ("Error: reference file not found")
        parser.print_help()
        sys.exit()
    if not os.path.isfile(args.query):
        print ("Error: query vcf file not found.")
        parser.print_help()
        sys.exit()
    if not os.path.isfile(args.genome):
        print("Error: genome file not found.")
        parser.print_help()
        sys.exit()

    report_head = '##genome=' + args.genome + '\n'
    report_head += '##ref=' + args.reference + '\n'
    report_head += '##query=' + args.query + '\n'
    report_head += '##chr_name=chromosome name of this data\n'
    report_head += '##ref_variant=matched variant from reference set\n'
    report_head += '##query_variant=matched variants from query set, corresponding to ref_variant\n'
    report_head += '##variants in both ref_variants and query_variants are separated by ";"\n'
    report_head += '##each variant is a tuple<POS,REF,ALT> separated by ",", POS is 0-based position, REF is sequence in reference genome, ALT is corresponding allele in donor genome\n'
    report_head += '##info=matching information, if directly matched, there will be "."; if >1 variants in ref_variants or query_variants, info will be subsequence from genome, and the modified subsequence by ref_variants and query_variants\n'
    report_head += '#chr_name\tref_variants\tquery_variants\tinfo\n'

    with open(args.output, 'w') as output:
        output.write(report_head)

    print ('read genome file...')
    sequence = _read_genome(args.genome)

    print ('read reference vcf file...')
    hash_refPos_snp = _load_vcf(args.reference, args.false_negative, refPos_vcfEntry)

    print ('read query vcf file...')
    hash_quePos_snp = _load_vcf(args.query, args.false_positive, quePos_vcfEntry)

    refOriginalNum = len(hash_refPos_snp)
    queOriginalNum = len(hash_quePos_snp)

    print ('first stage start...')
    if refOriginalNum > 0 and queOriginalNum > 0:
        direct_search(hash_refPos_snp, hash_quePos_snp)

    if args.direct_search:
        report(hash_refPos_snp, hash_quePos_snp, refOriginalNum, queOriginalNum)
        return

    # The later stages need ordered access, so move what is left into trees.
    refPos_snp = RedBlackTreeMap()
    quePos_snp = RedBlackTreeMap()

    for k in hash_refPos_snp:
        refPos_snp[k] = hash_refPos_snp[k]

    for k in hash_quePos_snp:
        quePos_snp[k] = hash_quePos_snp[k]

    print ('second stage start...')

    if len(refPos_snp) > 0 and len(quePos_snp) > 0:
        complex_search(refPos_snp, quePos_snp, sequence, False)

    # Same search with the roles swapped; rev=True keeps the output columns
    # in reference/query order.
    if len(refPos_snp) > 0 and len(quePos_snp) > 0:
        complex_search(quePos_snp, refPos_snp, sequence, True)

    print ('start clustering...')
    # Each entry is [position, source]: 1 for reference, -1 for query.
    data_list = []
    for pos in refPos_snp:
        data_list.append([pos, 1])
    for pos in quePos_snp:
        data_list.append([pos, -1])
    data_list.sort()

    cluster_list = []
    thresh = 400
    lower_bound = 10
    clustering_snp(data_list, cluster_list, thresh, sequence, lower_bound, refPos_snp, quePos_snp)

    cluster_search(refPos_snp, quePos_snp, data_list, cluster_list, sequence)

    report(refPos_snp, quePos_snp, refOriginalNum, queOriginalNum)
+
def test():
    """Print the check_tandem_repeat verdict for two sample sequences."""
    # Call form of print: parses on Python 2 and 3 alike and matches the
    # rest of the file; the statement form is a SyntaxError on Python 3.
    print (check_tandem_repeat('AAACCAAAACCC'))
    print (check_tandem_repeat('AAAAAAAAA'))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/py/vcfcompare_backup.py b/py/vcfcompare_backup.py
new file mode 100644
index 0000000..b3bd140
--- /dev/null
+++ b/py/vcfcompare_backup.py
@@ -0,0 +1,677 @@
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+ Author: Chen Sun(chensun at cse.psu.edu)
+"""
+
import sys
versionError = "You are using an old version of python, please upgrade to python 2.7+\n"
if sys.hexversion < 0x02070000:
    print (versionError)
    # sys.exit() is always available; the bare exit() helper is installed by
    # the site module and is missing e.g. when running with "python -S".
    sys.exit()
+
+import subprocess
+import argparse
+import os
+import copy
+from lib.red_black_tree import RedBlackTreeMap
+
# Epilog shown at the end of the --help output.
citation = ('About algorithm used in VCF-Compare, please refer to "Method for Cross-Validating Variant Call Set" Section in our paper.'
            '\n Please cite our paper.')

# Command line interface.  The arguments are parsed at import time and the
# resulting ``args`` namespace is read as a global by the functions below.
parser = argparse.ArgumentParser(epilog=citation)
parser.add_argument(
    '-r', '--reference',
    help='reference vcf file path, usually larger than query vcf file',
    required=True)
parser.add_argument(
    '-q', '--query',
    help='query vcf file path',
    required=True)
parser.add_argument(
    '-g', '--genome',
    help='reference genome file path, fasta file format',
    required=True)
parser.add_argument(
    '-p', '--false_positive',
    default='false_positive.vcf',
    help='false positive, i.e. mismatch vcf entries in query vcf file, default=false_positive.vcf')
parser.add_argument(
    '-n', '--false_negative',
    default='false_negative.vcf',
    help='false negative, i.e. mismatch vcf entries in reference vcf file, default=false_negative.vcf')
parser.add_argument(
    '-o', '--output',
    default='multi_match.out',
    help='output matched variants in stage 2 and 3, default=multi_match.out')
parser.add_argument(
    '-d', '--direct_search',
    action='store_true',
    help='if activate, only perform stage 1, default=not activate')
parser.add_argument(
    '-c', '--chr',
    default='.',
    help='chromosome name or id, used for parallel multi genome analysis')
parser.add_argument(
    '-s', '--stat',
    default='stat.txt',
    help='append statistics result into a file, useful for parallel multi genome analysis')
args = parser.parse_args()

# Position -> original VCF line, filled while the input files are read and
# used by report() to write the unmatched entries back out.
refPos_vcfEntry = {}
quePos_vcfEntry = {}
+
def direct_search(refPos_snp, quePos_snp):
    """Remove variants that are identical in both sets (stage 1).

    A query variant matches when the same position exists in ``refPos_snp``
    with an equal value.  Every match is written to ``direct_search.txt``
    in the current directory (the file is overwritten) as
    ``pos,ref_value<TAB>pos,query_value`` and then removed from BOTH
    mappings in place.
    """
    # Collect first: the mappings must not shrink while being iterated.
    matched = [key for key in quePos_snp
               if key in refPos_snp and refPos_snp[key] == quePos_snp[key]]

    # "with" closes the file even if a write or pop fails.
    with open('direct_search.txt', 'w') as match_file:
        for key in matched:
            match_file.write('{},{}\t{},{}\n'.format(
                key, refPos_snp[key], key, quePos_snp[key]))
            refPos_snp.pop(key, None)
            quePos_snp.pop(key, None)
+
+
def modify_sequence(sequence, pos, snpSet):
    """Return ``sequence`` with one variant applied at index ``pos``.

    ``snpSet`` is ``[type, ref, alt]``; ``len(ref)`` characters starting at
    ``pos`` are replaced by ``alt``.  The replaced characters are not
    checked against ``ref``.  A ``snpSet`` whose length is not 3 is only
    reported, the replacement is still attempted.
    """
    if len(snpSet) != 3:
        print ("Error: snp set size not right.")
    ref = snpSet[1]
    alt = snpSet[2]
    return sequence[:pos] + alt + sequence[pos + len(ref):]
+
def near_search(refPos_snp, quePos_snp, genome, blockSize):
    """Match a query variant against its nearest reference variant.

    For each query position the nearest reference entry
    (``refPos_snp.find_nearest``) is taken.  It is a candidate when it lies
    within ``blockSize`` and has the same variant type.  Both variants are
    applied separately to the genome window spanning them (padded by 100 on
    each side); if the results are equal ignoring case, the pair is
    appended to ``args.output`` and removed from both maps.

    refPos_snp -- ordered map providing ``find_nearest`` and ``pop``.
    quePos_snp -- map of query variants, iterated by position.
    """
    queRemoveList = []  # query positions to delete once iteration is done
    genomeLen = len(genome)
    # "with" closes the output even if a lookup below raises.
    with open(args.output, 'a') as output:
        if refPos_snp is None:
            print ("Error: refPos_snp is None")
        if quePos_snp is None:
            print ("Error: quePos_snp is None")

        for quePos in quePos_snp:
            ref_element = refPos_snp.find_nearest(quePos)
            ref_snp = ref_element.value()
            que_snp = quePos_snp[quePos]
            refPos = ref_element.key()

            if abs(refPos - quePos) > blockSize:
                continue
            if ref_snp[0] != que_snp[0]:
                continue

            # Genome window around both variants, clamped to the genome.
            seqStart = max(min(quePos, refPos) - 100, 0)
            seqEnd = min(max(quePos, refPos) + 100, genomeLen - 1)
            subSequence = genome[seqStart:seqEnd + 1]

            refSequence = modify_sequence(subSequence, refPos - seqStart, ref_snp)
            queSequence = modify_sequence(subSequence, quePos - seqStart, que_snp)
            if refSequence.upper() == queSequence.upper():
                queRemoveList.append(quePos)
                ref_variants = '{},{},{}'.format(refPos, ref_snp[1], ref_snp[2])
                query_variants = '{},{},{}'.format(quePos, que_snp[1], que_snp[2])
                output.write('.\t{}\t{}\t.\n'.format(ref_variants, query_variants))
                refPos_snp.pop(refPos, None)
                # NOTE(review): this leaves the loop over the query
                # variants, so at most one match is found per call.  Kept
                # as in the original; confirm whether that is intended.
                break

    # The former "match_set.append(pos)" was dropped: match_set is not
    # defined anywhere in this module, so the first match raised NameError.
    for pos in queRemoveList:
        quePos_snp.pop(pos, None)
+
def powerful_near_search(refPos_snp, quePos_snp, genome, blockSize):
    """Match each query variant against reference variants near it.

    For every query position the reference entries returned by
    ``refPos_snp.find_range(pos - blockSize, pos + blockSize)`` (bounds
    clamped to the genome) are tried in turn.  A candidate must have the
    same variant type; both variants are applied separately to the genome
    window spanning them (padded by 100 on each side) and they match when
    the results are equal ignoring case.  The first match is appended to
    ``args.output`` and removed from both maps.

    refPos_snp -- ordered map providing ``find_range`` and ``pop``.
    quePos_snp -- map of query variants, iterated by position.
    """
    queRemoveList = []  # query positions to delete once iteration is done
    genomeLen = len(genome)
    # "with" closes the output even if a lookup below raises.
    with open(args.output, 'a') as output:
        if refPos_snp is None:
            print ("Error: refPos_snp is None")
        if quePos_snp is None:
            print ("Error: quePos_snp is None")

        for quePos in quePos_snp:
            minPos = max(quePos - blockSize, 0)
            maxPos = min(quePos + blockSize, genomeLen - 1)
            que_snp = quePos_snp[quePos]
            for (refPos, ref_snp) in refPos_snp.find_range(minPos, maxPos):
                if ref_snp[0] != que_snp[0]:
                    continue

                # Genome window around both variants, clamped to the genome.
                seqStart = max(min(quePos, refPos) - 100, 0)
                seqEnd = min(max(quePos, refPos) + 100, genomeLen - 1)
                subSequence = genome[seqStart:seqEnd + 1]

                refSequence = modify_sequence(subSequence, refPos - seqStart, ref_snp)
                queSequence = modify_sequence(subSequence, quePos - seqStart, que_snp)
                if refSequence.upper() == queSequence.upper():
                    queRemoveList.append(quePos)
                    ref_variants = '{},{},{}'.format(refPos, ref_snp[1], ref_snp[2])
                    query_variants = '{},{},{}'.format(quePos, que_snp[1], que_snp[2])
                    output.write('.\t{}\t{}\t.\n'.format(ref_variants, query_variants))
                    refPos_snp.pop(refPos, None)
                    # Stop scanning candidates: this query variant is done
                    # and the map was just modified.
                    break

    # The former "match_set.append(pos)" was dropped: match_set is not
    # defined anywhere in this module, so the first match raised NameError.
    for pos in queRemoveList:
        quePos_snp.pop(pos, None)
+
def modify_by_list(pos_snp, posList, sequence, bound):
    """Return ``sequence`` with all variants listed in ``posList`` applied.

    pos_snp -- mapping position -> ``[type, ref, alt]``.
    posList -- positions to apply; the list itself is left untouched.
    sequence -- window of the genome that starts at genome index ``bound``.
    bound -- offset subtracted from each position to index ``sequence``.

    Variants are applied from the highest position down so that a length
    change never shifts the index of a variant still to be applied.  The
    replaced characters are not checked against ``ref``.
    """
    # sorted() already works on a copy, no deepcopy needed.
    for pos in sorted(posList, reverse=True):
        snp = pos_snp[pos]
        if len(snp) != 3:
            print ("Error: snp set size not right.")
        index = pos - bound
        ref = snp[1]
        alt = snp[2]
        sequence = sequence[:index] + alt + sequence[index + len(ref):]
    return sequence
+
def complex_search(refPos_snp, quePos_snp, genome, rev):
    """Match one variant of ``quePos_snp`` against several of ``refPos_snp``.

    For each entry of ``quePos_snp`` (in ``find_range(None, None)`` order)
    the entries of ``refPos_snp`` that start inside the span of its REF
    allele are collected, plus preceding entries whose own REF allele
    reaches into that span.  If applying all collected entries to the
    surrounding genome window gives the same sequence (ignoring case) as
    applying the single entry, the group is appended to ``args.output``
    and removed from both maps.

    refPos_snp -- ordered map providing ``linear_range_search``,
        ``before``, ``after`` and ``pop``; the "many" side.
    quePos_snp -- ordered map providing ``find_range`` and ``pop``; the
        "one" side.
    rev -- False when ``refPos_snp`` really is the reference set.  True
        when the caller passed the maps swapped; the two output columns are
        then swapped back so reference variants stay in the first column.
    """
    queRemoveList = []  # positions to drop from quePos_snp after the scan
    genomeLen = len(genome)
    # "with" closes the output even if a tree operation below raises.
    with open(args.output, 'a+') as output:
        if refPos_snp is None:
            print ("Error: refPos_snp is None")
        if quePos_snp is None:
            print ("Error: quePos_snp is None")

        start_position = None  # node the next linear scan resumes from
        for (quePos, que_snp) in quePos_snp.find_range(None, None):
            minPos = quePos
            maxPos = min(quePos + len(que_snp[1]) - 1, genomeLen - 1) + 1
            candidateRefPos = []
            candidateRefNode = []
            temp_refPos_snp = {}
            min_refPos = 3000000000
            max_refPos = 0
            for p in refPos_snp.linear_range_search(start_position, minPos, maxPos):
                k = p.key()
                min_refPos = min(min_refPos, k)
                max_refPos = max(max_refPos, k)
                candidateRefNode.append(p)
                candidateRefPos.append(k)
                temp_refPos_snp[k] = p.value()
            if not candidateRefPos:
                continue

            # Also take earlier entries whose REF allele overlaps minPos.
            before = refPos_snp.before(candidateRefNode[0])
            while before is not None and before.key() + len(before.value()[1]) - 1 >= minPos:
                candidateRefNode.insert(0, before)
                min_refPos = before.key()
                candidateRefPos.append(before.key())
                temp_refPos_snp[before.key()] = before.value()
                before = refPos_snp.before(candidateRefNode[0])

            candidateRefPos.sort()
            # Genome window around all involved variants, clamped.
            seqStart = max(min(quePos, min_refPos) - 100, 0)
            seqEnd = min(max(quePos, max_refPos) + 100, genomeLen - 1)
            subSequence = genome[seqStart:seqEnd + 1]

            refSequence = modify_by_list(temp_refPos_snp, candidateRefPos, subSequence, seqStart)
            queSequence = modify_sequence(subSequence, quePos - seqStart, que_snp)

            if refSequence.upper() != queSequence.upper():
                start_position = candidateRefNode[0]
                continue

            # Matched.  The resume node is fetched before the matched
            # entries are removed from the tree, as in the original order.
            start_position = refPos_snp.after(candidateRefNode[-1])
            queRemoveList.append(quePos)

            many_side = ';'.join(
                '{},{},{}'.format(pos, temp_refPos_snp[pos][1], temp_refPos_snp[pos][2])
                for pos in candidateRefPos)
            for pos in candidateRefPos:
                refPos_snp.pop(pos)
            one_side = '{},{},{}'.format(quePos, que_snp[1], que_snp[2])

            if rev:
                ref_variants, query_variants = one_side, many_side
            else:
                ref_variants, query_variants = many_side, one_side
            output.write('.\t{}\t{}\t.\n'.format(ref_variants, query_variants))

    for pos in queRemoveList:
        quePos_snp.pop(pos, None)
+
def _collect_window(pos_snp, start_position, minPos, maxPos):
    """Collect the entries ``linear_range_search`` yields for one window.

    Returns ``(nodes, positions, snps)``: the yielded nodes, their keys in
    the same order, and a plain dict mapping key -> value.
    """
    nodes = []
    positions = []
    snps = {}
    for p in pos_snp.linear_range_search(start_position, minPos, maxPos):
        k = p.key()
        nodes.append(p)
        positions.append(k)
        snps[k] = p.value()
    return nodes, positions, snps


def _format_variants(positions, snps):
    """Return the variants as ``pos,ref,alt`` items joined by ';'."""
    return ';'.join('{},{},{}'.format(pos, snps[pos][1], snps[pos][2])
                    for pos in positions)


def multi_search(refPos_snp, quePos_snp, genome, blockSize):
    """Match groups of variants inside a window around each query variant.

    For every position still present in ``quePos_snp`` all reference and
    all query entries that ``linear_range_search`` returns for the window
    ``[pos - blockSize, pos + blockSize + 1]`` (clamped to the genome) are
    collected.  Both groups are applied separately to the genome slice
    spanning them (padded by 100 on each side).  If the two results are
    equal ignoring case, the groups are appended to ``args.output`` and
    removed from both maps.

    refPos_snp / quePos_snp -- ordered maps providing
        ``linear_range_search``, ``after``, ``keys`` and ``pop``.
    """
    genomeLen = len(genome)
    # "with" closes the output even if a tree operation below raises.
    with open(args.output, 'a+') as output:
        ref_start_position = None  # nodes the next linear scans resume from
        que_start_position = None
        # Snapshot the keys: entries are removed while looping.  list()
        # works whatever keys() returns, whereas slicing with [:] needs an
        # actual list.
        for key in list(quePos_snp.keys()):
            if key not in quePos_snp:  # removed by an earlier match
                continue

            minPos = max(key - blockSize, 0)
            maxPos = min(key + blockSize, genomeLen - 1) + 1

            candidateRefNode, candidateRefPos, temp_refPos_snp = _collect_window(
                refPos_snp, ref_start_position, minPos, maxPos)
            candidateQueNode, candidateQuePos, temp_quePos_snp = _collect_window(
                quePos_snp, que_start_position, minPos, maxPos)

            if not candidateQuePos:
                print ("Error: query empty")
                continue
            if not candidateRefPos:
                continue

            # Genome slice covering the first and last collected entry of
            # both sides, clamped to the genome.
            lowBound = max(0, min(candidateRefPos[0], candidateQuePos[0]) - 100)
            upperBound = min(max(candidateRefPos[-1], candidateQuePos[-1]) + 100, genomeLen - 1)
            subSequence = genome[lowBound:upperBound + 1]

            refSequence = modify_by_list(temp_refPos_snp, candidateRefPos, subSequence, lowBound)
            queSequence = modify_by_list(temp_quePos_snp, candidateQuePos, subSequence, lowBound)

            if refSequence.upper() != queSequence.upper():
                ref_start_position = candidateRefNode[0]
                que_start_position = candidateQueNode[0]
                continue

            # Matched.  The resume nodes are fetched before the matched
            # entries are removed from the trees, as in the original order.
            ref_start_position = refPos_snp.after(candidateRefNode[-1])
            que_start_position = quePos_snp.after(candidateQueNode[-1])

            ref_variants = _format_variants(candidateRefPos, temp_refPos_snp)
            for pos in candidateRefPos:
                refPos_snp.pop(pos)

            query_variants = _format_variants(candidateQuePos, temp_quePos_snp)
            for pos in candidateQuePos:
                quePos_snp.pop(pos)

            output_info = '{},{},{},{},{}'.format(blockSize, lowBound, upperBound+1, subSequence, refSequence.upper())
            output.write('.\t{}\t{}\t{}\n'.format(ref_variants, query_variants, output_info))
+
+
def report(refPos_snp, quePos_snp, refOriginalNum, queOriginalNum):
    """Write the unmatched variants and report the matching statistics.

    The original VCF line of every position still present in ``refPos_snp``
    is appended to ``args.false_negative`` and of every position still
    present in ``quePos_snp`` to ``args.false_positive`` (both in ascending
    position order).  A summary is printed and one tab-separated line
    (chr, ref total, que total, ref matches, que matches) is appended to
    ``args.stat``.

    refPos_snp / quePos_snp -- mappings keyed by position holding the
        variants that were NOT matched.
    refOriginalNum / queOriginalNum -- number of variants before matching.
    """
    # Context managers release the handles even when a position is missing
    # from the *_vcfEntry lookup tables and the write loop raises.
    with open(args.false_positive, "a+") as positiveFile, \
            open(args.false_negative, "a+") as negativeFile:
        for pos in sorted(refPos_snp.keys()):
            negativeFile.write(refPos_vcfEntry[pos] + '\n')

        for pos in sorted(quePos_snp.keys()):
            positiveFile.write(quePos_vcfEntry[pos] + '\n')

    ref_left = len(refPos_snp)
    que_left = len(quePos_snp)
    ref_matched = refOriginalNum - ref_left
    que_matched = queOriginalNum - que_left

    print ('\n######### Matching Result ################\n')
    print (' ref total: {}\n que total: {}\n ref matches: {}\n que matches: {}\n ref mismatch: {}\n alt mismatch: {}\n'.format(
        refOriginalNum, queOriginalNum, ref_matched, que_matched, ref_left, que_left))

    with open(args.stat, 'a+') as stat_file:
        stat_file.write('{}\t{}\t{}\t{}\t{}\n'.format(
            args.chr, refOriginalNum, queOriginalNum, ref_matched, que_matched))
+
+
def _read_genome(genome_path):
    """Return the sequence lines of the FASTA file joined into one string.

    Lines starting with ">" are skipped, so all records are concatenated.
    """
    pieces = []
    with open(genome_path) as seq_file:
        # Stream the file and join once instead of readlines() plus
        # repeated string concatenation.
        for line in seq_file:
            if line.startswith(">"):
                continue
            pieces.append(line.strip())
    return ''.join(pieces)


def _load_vcf(vcf_path, mismatch_path, pos_vcfEntry):
    """Parse a VCF file into ``{0-based position: [type, ref, alt]}``.

    ``mismatch_path`` is truncated and receives every "#" header line of
    the VCF.  The stripped text of each accepted record is stored in
    ``pos_vcfEntry`` under its position.  Records with several ALT alleles
    (a "," in ALT) and blank lines are skipped.  The type is 'D' when REF is
    longer than ALT, 'I' when it is shorter and 'S' otherwise.
    """
    pos_snp = {}
    with open(mismatch_path, 'w') as mismatch_file, open(vcf_path) as vcf_file:
        for line in vcf_file:
            if line.startswith("#"):
                mismatch_file.write(line)
                continue
            line = line.strip()
            if not line:
                continue
            columns = line.split("\t")
            pos = int(columns[1]) - 1
            ref = columns[3]
            alt = columns[4]
            if ',' in alt:
                continue
            if len(ref) > len(alt):
                snpType = 'D'
            elif len(ref) < len(alt):
                snpType = 'I'
            else:
                snpType = 'S'
            pos_snp[pos] = [snpType, ref, alt]
            pos_vcfEntry[pos] = line
    return pos_snp


def main():
    """Run the three-stage matching on the files given on the command line.

    Stage 1 removes variants that are identical in both sets
    (direct_search); with ``--direct_search`` the run stops there.  Stage 2
    runs complex_search in both directions.  Stage 3 runs multi_search
    with a series of growing block sizes.  The unmatched variants and the
    statistics are written by report().
    """
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit()

    if not os.path.isfile(args.reference):
        print ("Error: reference file not found")
        parser.print_help()
        sys.exit()
    if not os.path.isfile(args.query):
        print ("Error: query vcf file not found.")
        parser.print_help()
        sys.exit()
    if not os.path.isfile(args.genome):
        print("Error: genome file not found.")
        parser.print_help()
        sys.exit()

    report_head = '##genome=' + args.genome + '\n'
    report_head += '##ref=' + args.reference + '\n'
    report_head += '##query=' + args.query + '\n'
    report_head += '##chr_name=chromosome name of this data\n'
    report_head += '##ref_variant=matched variant from reference set\n'
    report_head += '##query_variant=matched variants from query set, corresponding to ref_variant\n'
    report_head += '##variants in both ref_variants and query_variants are separated by ";"\n'
    report_head += '##each variant is a tuple<POS,REF,ALT> separated by ",", POS is 0-based position, REF is sequence in reference genome, ALT is corresponding allele in donor genome\n'
    report_head += '##info=matching information, if directly matched, there will be "."; if >1 variants in ref_variants or query_variants, info will be subsequence from genome, and the modified subsequence by ref_variants and query_variants\n'
    report_head += '#chr_name\tref_variants\tquery_variants\tinfo\n'

    with open(args.output, 'w') as output:
        output.write(report_head)

    print ('read genome file...')
    sequence = _read_genome(args.genome)

    print ('read reference vcf file...')
    hash_refPos_snp = _load_vcf(args.reference, args.false_negative, refPos_vcfEntry)

    print ('read query vcf file...')
    hash_quePos_snp = _load_vcf(args.query, args.false_positive, quePos_vcfEntry)

    refOriginalNum = len(hash_refPos_snp)
    queOriginalNum = len(hash_quePos_snp)

    print ('first stage start...')
    if refOriginalNum > 0 and queOriginalNum > 0:
        direct_search(hash_refPos_snp, hash_quePos_snp)

    if args.direct_search:
        report(hash_refPos_snp, hash_quePos_snp, refOriginalNum, queOriginalNum)
        return

    # The later stages need ordered access, so move what is left into trees.
    refPos_snp = RedBlackTreeMap()
    quePos_snp = RedBlackTreeMap()

    for k in hash_refPos_snp:
        refPos_snp[k] = hash_refPos_snp[k]

    for k in hash_quePos_snp:
        quePos_snp[k] = hash_quePos_snp[k]

    print ('second stage start...')

    if len(refPos_snp) > 0 and len(quePos_snp) > 0:
        complex_search(refPos_snp, quePos_snp, sequence, False)

    # Same search with the roles swapped; rev=True keeps the output columns
    # in reference/query order.
    if len(refPos_snp) > 0 and len(quePos_snp) > 0:
        complex_search(quePos_snp, refPos_snp, sequence, True)

    print ('third stage start...')
    for block_size in [2, 4, 5, 10, 20, 50, 100, 200]:
        print ('try window size ' + str(block_size*2) + '...')
        if len(refPos_snp) > 0 and len(quePos_snp) > 0:
            multi_search(refPos_snp, quePos_snp, sequence, block_size)

    report(refPos_snp, quePos_snp, refOriginalNum, queOriginalNum)
+
+if __name__ == '__main__':
+ main()
diff --git a/script/add_marker.py b/script/add_marker.py
new file mode 100644
index 0000000..e69de29
diff --git a/script/compare_match.py b/script/compare_match.py
new file mode 100644
index 0000000..7a90f60
--- /dev/null
+++ b/script/compare_match.py
@@ -0,0 +1,44 @@
+from sys import argv
+
# Both input paths are mandatory; stop with a usage line instead of an
# IndexError traceback when one is missing.
if len(argv) < 3:
    raise SystemExit('usage: compare_match.py <baseline_file> <query_file>')

baseline_filename = argv[1]
query_filename = argv[2]

# position -> full line / position -> compared columns, for the baseline file
baseline_pos_line = {}
baseline_pos_content = {}

# same two maps for the query file
query_pos_line = {}
query_pos_content = {}
+
def read_file(filename):
    """Index a tab-separated variant file by the integer in its second column.

    Lines starting with '#' and blank lines are skipped.

    Returns a pair of dicts keyed by position:
      pos_line    -- the stripped line itself
      pos_content -- columns[1:-1] re-joined with tabs, i.e. everything except
                     the first and the last column

    When several lines share a position the last one wins.
    """
    pos_line = {}
    pos_content = {}
    with open(filename) as f:
        for line in f:
            if line.startswith('#'):
                continue
            line = line.strip()
            if not line:
                # a blank line (e.g. at end of file) has no position column
                continue
            columns = line.split('\t')
            pos = int(columns[1])
            pos_line[pos] = line
            pos_content[pos] = '\t'.join(columns[1:-1])
    return pos_line, pos_content
+
+
(baseline_pos_line, baseline_pos_content) = read_file(baseline_filename)
(query_pos_line, query_pos_content) = read_file(query_filename)

# Every print below takes one pre-built string so the output is the same under
# Python 2 and Python 3.  Positions are visited in ascending order to make the
# report reproducible.
for pos in sorted(baseline_pos_content):
    if pos not in query_pos_content:
        print(str(pos) + ' exist in baseline but not in query')
        print(baseline_pos_line[pos])
    elif baseline_pos_content[pos] != query_pos_content[pos]:
        print('same pos but not equal content')
        print(baseline_pos_line[pos])
        print(query_pos_line[pos])

for pos in sorted(query_pos_content):
    if pos not in baseline_pos_content:
        print(str(pos) + ' in query but not in baseline')
        print(query_pos_line[pos])
\ No newline at end of file
diff --git a/script/count_decomposed_matching.py b/script/count_decomposed_matching.py
new file mode 100644
index 0000000..faedac4
--- /dev/null
+++ b/script/count_decomposed_matching.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+ Authors:
+ Chen Sun(chensun at cse.psu.edu)
+ Paul Medvedev(pashadag at cse.psu.edu)
+"""
+
+from sys import argv
+
# Two paths are expected on the command line; stop with a usage line instead of
# an IndexError traceback when one is missing.
if len(argv) < 3:
    raise SystemExit('usage: count_decomposed_matching.py <decomposed_file> <matching_file>')

decomposed_filename = argv[1]

matching_filename = argv[2]
\ No newline at end of file
diff --git a/script/direct_match.py b/script/direct_match.py
new file mode 100644
index 0000000..70b45cd
--- /dev/null
+++ b/script/direct_match.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+ Authors:
+ Chen Sun(chensun at cse.psu.edu)
+ Paul Medvedev(pashadag at cse.psu.edu)
+"""
+
+from sys import argv
+
# Two paths are expected on the command line; stop with a usage line instead of
# an IndexError traceback when one is missing.
if len(argv) < 3:
    raise SystemExit('usage: direct_match.py <baseline_file> <query_file>')

baseline_filename = argv[1]

query_filename = argv[2]

# containers for the variants of each input; nothing fills them yet
baseline_variant = {}

query_variant = {}
\ No newline at end of file
diff --git a/script/filter_hc.py b/script/filter_hc.py
new file mode 100644
index 0000000..d9f10eb
--- /dev/null
+++ b/script/filter_hc.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+ Authors:
+ Paul Medvedev(pashadag at cse.psu.edu)
+ Chen Sun(chensun at cse.psu.edu)
+"""
+
+import sys
+import textwrap as _textwrap
+import argparse
+import os
+from intervaltree import Interval, IntervalTree
+
versionError = "You are using an old version of python, please upgrade to python 2.7+\n"
if sys.hexversion < 0x02070000:
    # Report on stderr and exit non-zero so a calling shell sees the failure.
    sys.stderr.write(versionError)
    sys.exit(1)

# Not referenced elsewhere in this script.
RUN = True

author_email = 'chensun at cse.psu.edu'
+
# Chromosome name <-> slot index (0..23) for 1-22, X and Y.
# chr_2_index also accepts the "chr"-prefixed spelling (chr1 ... chrY) so that
# files using that convention are recognised instead of being treated as
# unknown contigs.  index_2_chr always yields the bare name.
chr_2_index = {}
index_2_chr = {}

for _idx, _name in enumerate([str(n) for n in range(1, 23)] + ['X', 'Y']):
    chr_2_index[_name] = _idx
    chr_2_index['chr' + _name] = _idx
    index_2_chr[_idx] = _name
+
+
class SmartFormatter(argparse.HelpFormatter):
    """Help formatter that keeps the explicit line breaks of help strings."""

    def _split_lines(self, text, width):
        """Wrap each newline-separated paragraph of *text* to *width* columns.

        Returns the wrapped lines of all paragraphs in order.
        """
        wrapped = []
        for paragraph in text.split('\n'):
            # textwrap.wrap('') returns [], which would silently drop a blank
            # separator line; keep it as an empty output line instead.
            wrapped.extend(_textwrap.wrap(paragraph, width) or [''])
        return wrapped

    def _fill_text(self, text, width, indent):
        """Prefix every line of *text* with *indent*; no re-wrapping is done."""
        return ''.join(indent + line for line in text.splitlines(True))
+
citation = 'Please cite our paper.'

# Command line: one or more VCF files, the BED file whose intervals are
# filtered out, and the directory that receives the filtered copies.
parser = argparse.ArgumentParser(
    prog="filter_hc",
    epilog=citation,
    formatter_class=lambda prog: SmartFormatter(prog, max_help_position=8))
parser.add_argument('vcf_files', nargs='+', metavar='File List', help='VCF file list')
# The BED file is opened unconditionally further down, so it must be supplied;
# argparse now reports a missing -b instead of open(None) failing later.
parser.add_argument('-b', '--bed_file', required=True, help='bed hc file', metavar='File')
parser.add_argument('-o', '--output', help='output directory', metavar='DIRECTORY', default='./nhc')
args = parser.parse_args()
+
# makedirs (unlike mkdir) also creates missing parent directories.
if not os.path.isdir(args.output):
    os.makedirs(args.output)

# One interval tree per chromosome slot (see chr_2_index / index_2_chr).
interval_tree_list = [IntervalTree() for _ in range(len(index_2_chr))]

with open(args.bed_file) as bed:
    for line in bed:
        if line.startswith('#'):
            continue
        columns = line.split('\t')
        chr_name = columns[0]
        if chr_name not in chr_2_index:
            # unknown contig: echo the line and ignore it
            print('BED: ' + line)
            continue
        chr_index = chr_2_index[chr_name]
        start_p = int(columns[1])
        end_p = int(columns[2])  # 0 based, exclude end position
        if end_p <= start_p:
            # IntervalTree refuses null intervals (ValueError); an empty BED
            # record covers no position, so it can be skipped safely.
            continue
        interval_tree_list[chr_index][start_p:end_p] = (start_p, end_p)
+
# For every input VCF write <output>/<basename>.nhc.vcf containing the header
# lines, the records on unknown contigs, and the records whose position is NOT
# covered by any BED interval.
for vcf_filename in args.vcf_files:
    match_basename = os.path.basename(vcf_filename)
    nhc_filename = args.output + '/' + match_basename + '.nhc.vcf'

    # Collect first and write afterwards, so a malformed record aborts before
    # any output file is created.
    output_list = []
    with open(vcf_filename) as vcf_file:
        for line in vcf_file:
            if line.startswith('#'):
                output_list.append(line)
                continue
            columns = line.split('\t')
            chr_name = columns[0]
            if chr_name not in chr_2_index:
                output_list.append(line)
                continue
            chr_index = chr_2_index[chr_name]
            var_pos = int(columns[1]) - 1  # 1 based system to 0 based system
            query_result = interval_tree_list[chr_index][var_pos]
            if len(query_result) == 0:
                output_list.append(line)

    # 'with' closes the handle even when a write fails.
    with open(nhc_filename, 'w') as nhc_file:
        nhc_file.writelines(output_list)
\ No newline at end of file
diff --git a/script/filter_lcr.py b/script/filter_lcr.py
new file mode 100644
index 0000000..d9f10eb
--- /dev/null
+++ b/script/filter_lcr.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+ Authors:
+ Paul Medvedev(pashadag at cse.psu.edu)
+ Chen Sun(chensun at cse.psu.edu)
+"""
+
+import sys
+import textwrap as _textwrap
+import argparse
+import os
+from intervaltree import Interval, IntervalTree
+
versionError = "You are using an old version of python, please upgrade to python 2.7+\n"
if sys.hexversion < 0x02070000:
    # Report on stderr and exit non-zero so a calling shell sees the failure.
    sys.stderr.write(versionError)
    sys.exit(1)

# Not referenced elsewhere in this script.
RUN = True

author_email = 'chensun at cse.psu.edu'
+
# Chromosome name <-> slot index (0..23) for 1-22, X and Y.
# chr_2_index also accepts the "chr"-prefixed spelling (chr1 ... chrY) so that
# files using that convention are recognised instead of being treated as
# unknown contigs.  index_2_chr always yields the bare name.
chr_2_index = {}
index_2_chr = {}

for _idx, _name in enumerate([str(n) for n in range(1, 23)] + ['X', 'Y']):
    chr_2_index[_name] = _idx
    chr_2_index['chr' + _name] = _idx
    index_2_chr[_idx] = _name
+
+
class SmartFormatter(argparse.HelpFormatter):
    """Help formatter that keeps the explicit line breaks of help strings."""

    def _split_lines(self, text, width):
        """Wrap each newline-separated paragraph of *text* to *width* columns.

        Returns the wrapped lines of all paragraphs in order.
        """
        wrapped = []
        for paragraph in text.split('\n'):
            # textwrap.wrap('') returns [], which would silently drop a blank
            # separator line; keep it as an empty output line instead.
            wrapped.extend(_textwrap.wrap(paragraph, width) or [''])
        return wrapped

    def _fill_text(self, text, width, indent):
        """Prefix every line of *text* with *indent*; no re-wrapping is done."""
        return ''.join(indent + line for line in text.splitlines(True))
+
citation = 'Please cite our paper.'

# Command line: one or more VCF files, the BED file whose intervals are
# filtered out, and the directory that receives the filtered copies.
# prog names this script, so usage/help output no longer shows "filter_hc".
parser = argparse.ArgumentParser(
    prog="filter_lcr",
    epilog=citation,
    formatter_class=lambda prog: SmartFormatter(prog, max_help_position=8))
parser.add_argument('vcf_files', nargs='+', metavar='File List', help='VCF file list')
# The BED file is opened unconditionally further down, so it must be supplied;
# argparse now reports a missing -b instead of open(None) failing later.
parser.add_argument('-b', '--bed_file', required=True, help='bed hc file', metavar='File')
parser.add_argument('-o', '--output', help='output directory', metavar='DIRECTORY', default='./nhc')
args = parser.parse_args()
+
# makedirs (unlike mkdir) also creates missing parent directories.
if not os.path.isdir(args.output):
    os.makedirs(args.output)

# One interval tree per chromosome slot (see chr_2_index / index_2_chr).
interval_tree_list = [IntervalTree() for _ in range(len(index_2_chr))]

with open(args.bed_file) as bed:
    for line in bed:
        if line.startswith('#'):
            continue
        columns = line.split('\t')
        chr_name = columns[0]
        if chr_name not in chr_2_index:
            # unknown contig: echo the line and ignore it
            print('BED: ' + line)
            continue
        chr_index = chr_2_index[chr_name]
        start_p = int(columns[1])
        end_p = int(columns[2])  # 0 based, exclude end position
        if end_p <= start_p:
            # IntervalTree refuses null intervals (ValueError); an empty BED
            # record covers no position, so it can be skipped safely.
            continue
        interval_tree_list[chr_index][start_p:end_p] = (start_p, end_p)
+
# For every input VCF write <output>/<basename>.nhc.vcf containing the header
# lines, the records on unknown contigs, and the records whose position is NOT
# covered by any BED interval.
for vcf_filename in args.vcf_files:
    match_basename = os.path.basename(vcf_filename)
    nhc_filename = args.output + '/' + match_basename + '.nhc.vcf'

    # Collect first and write afterwards, so a malformed record aborts before
    # any output file is created.
    output_list = []
    with open(vcf_filename) as vcf_file:
        for line in vcf_file:
            if line.startswith('#'):
                output_list.append(line)
                continue
            columns = line.split('\t')
            chr_name = columns[0]
            if chr_name not in chr_2_index:
                output_list.append(line)
                continue
            chr_index = chr_2_index[chr_name]
            var_pos = int(columns[1]) - 1  # 1 based system to 0 based system
            query_result = interval_tree_list[chr_index][var_pos]
            if len(query_result) == 0:
                output_list.append(line)

    # 'with' closes the handle even when a write fails.
    with open(nhc_filename, 'w') as nhc_file:
        nhc_file.writelines(output_list)
\ No newline at end of file
diff --git a/script/overlap.py b/script/overlap.py
new file mode 100644
index 0000000..90673e0
--- /dev/null
+++ b/script/overlap.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python
+
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+ Authors:
+ Chen Sun(chensun at cse.psu.edu)
+ Paul Medvedev(pashadag at cse.psu.edu)
+"""
+
+import sys
+import textwrap as _textwrap
+import argparse
+import os
+import matplotlib
+matplotlib.use('agg')
+import matplotlib.pyplot as plt
+from matplotlib_venn import venn2, venn2_circles
+
versionError = "You are using an old version of python, please upgrade to python 2.7+\n"
if sys.hexversion < 0x02070000:
    # Report on stderr and exit non-zero so a calling shell sees the failure.
    sys.stderr.write(versionError)
    sys.exit(1)

# Not referenced elsewhere in this script.
RUN = True

author_email = 'chensun at cse.psu.edu'
+
class SmartFormatter(argparse.HelpFormatter):
    """Help formatter that keeps the explicit line breaks of help strings."""

    def _split_lines(self, text, width):
        """Wrap each newline-separated paragraph of *text* to *width* columns.

        Returns the wrapped lines of all paragraphs in order.
        """
        wrapped = []
        for paragraph in text.split('\n'):
            # textwrap.wrap('') returns [], which would silently drop a blank
            # separator line; keep it as an empty output line instead.
            wrapped.extend(_textwrap.wrap(paragraph, width) or [''])
        return wrapped

    def _fill_text(self, text, width, indent):
        """Prefix every line of *text* with *indent*; no re-wrapping is done."""
        return ''.join(indent + line for line in text.splitlines(True))
+
citation = 'Please cite our paper.'

parser = argparse.ArgumentParser(
    prog="overlap",
    epilog=citation,
    formatter_class=lambda prog: SmartFormatter(prog, max_help_position=8))
parser.add_argument('match_files', nargs='+', metavar='File List',
                    help='.match file list, should take the same VCF as baseline')
args = parser.parse_args()
# The two-set Venn diagram drawn at the end of the script reads the counts of
# the first two files; reject a single input here instead of failing with an
# IndexError after all the parsing work.
if len(args.match_files) < 2:
    parser.error('at least two .match files are required')

# key: "<chromosome>,<position>" of a baseline variant
# value: number of match-file entries that referenced it
baselinevar_num = {}
# one entry per input file: how many baseline keys that file contributed
variant_num_list = []
+
+
def read_file(filename):
    """Count the baseline variants referenced by one .match file.

    For every data line (not starting with '#', not blank) the fifth column is
    split on ';'.  If its first item is '.', the line's own position (second
    column) is used; otherwise the text before the first ',' of each item is
    used as a position.  Each "<chromosome>,<position>" key is incremented in
    the module-level ``baselinevar_num``.

    The number of keys counted for this file is left in the module-level
    ``variant_num`` and appended to ``variant_num_list``.
    """
    global baselinevar_num
    global variant_num
    variant_num = 0
    with open(filename) as f:
        for line in f:
            if line.startswith('#'):
                continue
            line = line.strip()
            if not line:
                # a blank line has no fifth column to look at
                continue
            columns = line.split('\t')
            chrname = columns[0]
            baseline_columns = columns[4].split(';')
            if baseline_columns[0] == '.':
                positions = [columns[1]]
            else:
                positions = [baseline.split(',')[0] for baseline in baseline_columns]
            # NOTE(review): the source this was recovered from had lost its
            # indentation; variant_num is assumed to grow once per counted key
            # (the reading under which the Venn subset sizes computed from it
            # cannot go negative) -- confirm against upstream.
            for position in positions:
                baseline_key = chrname + ',' + position
                baselinevar_num[baseline_key] = baselinevar_num.get(baseline_key, 0) + 1
                variant_num += 1
    variant_num_list.append(variant_num)
+
for filename in args.match_files:
    read_file(filename)

# A baseline variant is shared when its count reaches the number of input files.
overlap_threshold = len(args.match_files)

print(overlap_threshold)
overlap_num = 0
for baseline_key in baselinevar_num:
    if baselinevar_num[baseline_key] >= overlap_threshold:
        overlap_num += 1

# Single pre-built string: prints "<overlap> <list>" under Python 2 and 3 alike.
print(str(overlap_num) + ' ' + str(variant_num_list))
+
# Subset sizes
s = (
    variant_num_list[0] - overlap_num,  # Ab
    variant_num_list[1] - overlap_num,  # aB
    overlap_num,  # AB
)

v = venn2(subsets=s, set_labels=('bwa-fb', 'pt'))

# Subset labels, with thousands separators.  matplotlib-venn hands back None
# for a region it did not draw (e.g. an empty subset), so guard each label.
for region_id, size in zip(('10', '01', '11'), s):
    label = v.get_label_by_id(region_id)
    if label is not None:
        label.set_text(format(size, ',d'))

# Subset colors
#v.get_patch_by_id('10').set_color('red')
#v.get_patch_by_id('01').set_color('yellow')
#v.get_patch_by_id('11').set_color('blue')

# Subset alphas
#v.get_patch_by_id('10').set_alpha(0.4)
#v.get_patch_by_id('01').set_alpha(1.0)
#v.get_patch_by_id('11').set_alpha(0.7)

for text in v.set_labels:
    if text is not None:
        text.set_fontsize(32)
for text in v.subset_labels:
    if text is not None:
        text.set_fontsize(32)

# Border styles: no visible outline (line width is a number, not a string)
c = venn2_circles(subsets=s, linestyle='solid', linewidth=0)
#c[0].set_ls('dashed') # Line style
#c[0].set_lw(2.0) # Line width

#plt.show()
plt.tight_layout()
plt.savefig('./vm_venn.png')
diff --git a/script/overlap_direct.py b/script/overlap_direct.py
new file mode 100644
index 0000000..ee7c6e0
--- /dev/null
+++ b/script/overlap_direct.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python
+
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+ Authors:
+ Chen Sun(chensun at cse.psu.edu)
+ Paul Medvedev(pashadag at cse.psu.edu)
+"""
+
+import sys
+import textwrap as _textwrap
+import argparse
+import os
+import matplotlib
+matplotlib.use('agg')
+import matplotlib.pyplot as plt
+from matplotlib_venn import venn2, venn2_circles
+
versionError = "You are using an old version of python, please upgrade to python 2.7+\n"
if sys.hexversion < 0x02070000:
    # Report on stderr and exit non-zero so a calling shell sees the failure.
    sys.stderr.write(versionError)
    sys.exit(1)

# Not referenced elsewhere in this script.
RUN = True

author_email = 'chensun at cse.psu.edu'
+
class SmartFormatter(argparse.HelpFormatter):
    """Help formatter that keeps the explicit line breaks of help strings."""

    def _split_lines(self, text, width):
        """Wrap each newline-separated paragraph of *text* to *width* columns.

        Returns the wrapped lines of all paragraphs in order.
        """
        wrapped = []
        for paragraph in text.split('\n'):
            # textwrap.wrap('') returns [], which would silently drop a blank
            # separator line; keep it as an empty output line instead.
            wrapped.extend(_textwrap.wrap(paragraph, width) or [''])
        return wrapped

    def _fill_text(self, text, width, indent):
        """Prefix every line of *text* with *indent*; no re-wrapping is done."""
        return ''.join(indent + line for line in text.splitlines(True))
+
citation = 'Please cite our paper.'

# prog names this script, so usage/help output no longer shows "overlap".
parser = argparse.ArgumentParser(
    prog="overlap_direct",
    epilog=citation,
    formatter_class=lambda prog: SmartFormatter(prog, max_help_position=8))
parser.add_argument('match_files', nargs='+', metavar='File List',
                    help='.match file list, should take the same VCF as baseline')
args = parser.parse_args()
# The two-set Venn diagram drawn at the end of the script reads the counts of
# the first two files; reject a single input here instead of failing with an
# IndexError after all the parsing work.
if len(args.match_files) < 2:
    parser.error('at least two .match files are required')

# key: "<chromosome>,<position>" taken from the first two columns of a line
# value: number of lines, over all input files, that carried this key
baselinevar_num = {}
# one entry per input file: number of data lines read from that file
variant_num_list = []
+
+
def read_file(filename):
    """Count the variants of one file by "<chromosome>,<position>".

    Every data line (not starting with '#', not blank) increments the entry of
    the module-level ``baselinevar_num`` keyed by its first two tab-separated
    columns.  The number of data lines is left in the module-level
    ``variant_num`` and appended to ``variant_num_list``.
    """
    global baselinevar_num
    global variant_num
    variant_num = 0
    with open(filename) as f:
        for line in f:
            if line.startswith('#'):
                continue
            line = line.strip()
            if not line:
                # a blank line has no position column
                continue
            columns = line.split('\t')
            baseline_key = columns[0] + ',' + columns[1]
            baselinevar_num[baseline_key] = baselinevar_num.get(baseline_key, 0) + 1
            variant_num += 1

    variant_num_list.append(variant_num)
+
for filename in args.match_files:
    read_file(filename)

# A variant is shared when its count reaches the number of input files.
overlap_threshold = len(args.match_files)

print(overlap_threshold)
overlap_num = 0
for baseline_key in baselinevar_num:
    if baselinevar_num[baseline_key] >= overlap_threshold:
        overlap_num += 1

# Single pre-built string: prints "<overlap> <list>" under Python 2 and 3 alike.
print(str(overlap_num) + ' ' + str(variant_num_list))
+
# Subset sizes
s = (
    variant_num_list[0] - overlap_num,  # Ab
    variant_num_list[1] - overlap_num,  # aB
    overlap_num,  # AB
)

v = venn2(subsets=s, set_labels=('bwa-fb', 'pt'))

# Subset labels, with thousands separators.  matplotlib-venn hands back None
# for a region it did not draw (e.g. an empty subset), so guard each label.
for region_id, size in zip(('10', '01', '11'), s):
    label = v.get_label_by_id(region_id)
    if label is not None:
        label.set_text(format(size, ',d'))

# Subset colors
#v.get_patch_by_id('10').set_color('red')
#v.get_patch_by_id('01').set_color('yellow')
#v.get_patch_by_id('11').set_color('blue')

# Subset alphas
#v.get_patch_by_id('10').set_alpha(0.4)
#v.get_patch_by_id('01').set_alpha(1.0)
#v.get_patch_by_id('11').set_alpha(0.7)

for text in v.set_labels:
    if text is not None:
        text.set_fontsize(32)
for text in v.subset_labels:
    if text is not None:
        text.set_fontsize(32)

# Border styles: no visible outline (line width is a number, not a string)
c = venn2_circles(subsets=s, linestyle='solid', linewidth=0)
#c[0].set_ls('dashed') # Line style
#c[0].set_lw(2.0) # Line width

#plt.show()
plt.tight_layout()
plt.savefig('./vt_venn.png')
diff --git a/script/varmatch b/script/varmatch
new file mode 100644
index 0000000..1b4bc6c
--- /dev/null
+++ b/script/varmatch
@@ -0,0 +1,484 @@
+#!/usr/bin/env python
+
+import sys
+import subprocess
+import argparse
+import os
+import time
+
# When False, shell_run() only echoes commands instead of executing them.
RUN = True
author_email = 'chensun at cse.psu.edu'
versionError = 'You are using an old version of python, please upgrade to python 2.7+\n'

if sys.hexversion < 0x02070000:
    # Report on stderr and exit non-zero so a calling shell sees the failure.
    sys.stderr.write(versionError)
    sys.exit(1)
+
citation = 'Please cite our paper'

# ---- command line -----------------------------------------------------------
parser = argparse.ArgumentParser(epilog=citation)
parser.add_argument(
    '-r',
    metavar='reference.vcf',
    help='reference vcf file path')
parser.add_argument(
    '-q',
    metavar='query.vcf',
    help='query vcf file path')
parser.add_argument(
    '-g',
    metavar='genome.fa',
    help='genome sequence file path, FASTA file format')
parser.add_argument(
    '-t',
    metavar='N',
    default='1',
    help='thread number for parallel')
parser.add_argument(
    '-n', '--normalize',
    action='store_true',
    help='if activate, VarMatch will normalize reference vcf and query vcf file before comparing.')
#parser.add_argument('-d', '--direct_search', action='store_true', help='if activate, only perform direct matching')
parser.add_argument(
    '--multi_genome',
    metavar='genome_list.txt',
    help='genome list file contain chromosome name and FASTA file absolute path')
parser.add_argument(
    '--multi_vcf',
    nargs='+',
    metavar='file.vcf ...',
    help='vcf files (usually more than two) that need to compare')
parser.add_argument(
    '-o', '--output',
    metavar='output/',
    help='output directory, default is the current directory')
#parser.add_argument('-v', '--visualize', help='visualize results')
parser.add_argument(
    '--purify',
    action='store_true',
    help='if activate, VarMatch will check if variant matches reference genome sequence.')
parser.add_argument(
    '-H',
    action='store_true',
    help='if active, VarMatch will not match haplotype')
#parser.add_argument('--remove_dup', metavar='single.vcf',
#                    help='provide single vcf file, remove duplications in this vcf file')

# strategy for whole genome:
# since current version of vt-normalize can not handle whole genome sequence data
# vcfcompare split whole genome data according to chromosome detected in --whole-genome directory
args = parser.parse_args()

# Chromosome names looked up in a --multi_genome list: 1-22, X and Y.
human_chromosome_list = [str(number) for number in range(1, 23)] + ['X', 'Y']
+
+
def shell_run(command, hide=False):
    """Run *command* through the shell.

    command -- complete shell command line
    hide    -- when true, the child's stdout and stderr are discarded

    If the module flag RUN is false nothing is executed: the function pauses
    for 3.5 seconds, echoes the command and returns None.  Otherwise the
    child's exit status is returned (callers may ignore it), and a non-zero
    status is reported on stderr.
    """
    if not RUN:
        time.sleep(3.5)
        print(command)
        return None

    if hide:
        # 'with' closes the devnull handle even if the call raises
        with open(os.devnull, 'w') as devnull:
            status = subprocess.call(command, shell=True, stdout=devnull, stderr=subprocess.STDOUT)
    else:
        status = subprocess.call(command, shell=True)

    if status != 0:
        sys.stderr.write('Warning: command exited with status ' + str(status) + ': ' + command + '\n')
    return status
+
+
def check_command(command):
    """
    check if corresponding command available

    Returns True when *command* is the path of an existing file, or when an
    executable file of that name sits in one of the PATH directories;
    False otherwise.
    """
    if os.path.isfile(command):
        return True

    # os.pathsep instead of a literal ':' and .get() instead of [...]:
    # works on every platform and when PATH is not set at all.
    for cmdpath in os.environ.get('PATH', '').split(os.pathsep):
        if not cmdpath:
            continue
        candidate = os.path.join(cmdpath, command)
        # Probe the one candidate instead of listing the whole directory:
        # cannot fail on unreadable directories and ignores non-executables.
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return True
    return False
+
+
+# [todo] check vcf files, corresponding genome file should exist
+
+
+# purify vcf file
def purify(input_file, output_file, genome_file):
    """Run the purify tool: *input_file* + *genome_file* -> *output_file*.

    The availability of ``purify_tool`` is verified on the first call only;
    the result is remembered in the module flag ``check_purify_command``.
    The program terminates with status 1 if the tool cannot be found.
    """
    global check_purify_command
    if not check_purify_command:
        if not check_command(purify_tool):
            print ('Error: can not find program: ' + purify_tool)
            print ('\t Try "make" command before execute, or contact author for support: ' + author_email)
            sys.exit(1)
        check_purify_command = True

    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    # Quote the paths: the command goes through a shell, so blanks or shell
    # metacharacters in a file name must not split or alter the arguments.
    purify_command = (purify_tool + ' -i ' + quote(input_file) + ' -g ' + quote(genome_file)
                      + ' -o ' + quote(output_file))
    shell_run(purify_command)
+
+
def pairwise_compare(reference_file, query_file, genome_file, output_prefix):
    """Run the compare tool on one reference/query VCF pair.

    Builds ``compare_tool -r <reference> -q <query> -g <genome> -o <prefix>``
    and appends -G when args.H is set, -n when args.normalize is set, and
    -t <N> when args.t is greater than 1.

    The availability of ``compare_tool`` is verified on the first call only;
    the result is remembered in the module flag ``check_compare_command``.
    The program terminates with status 1 if the tool cannot be found.
    """
    global check_compare_command
    if not check_compare_command:
        if not check_command(compare_tool):
            print ('Error: can not find program: ' + compare_tool)
            print ('\t Try "make" command before execute, or contact author for support: ' + author_email)
            sys.exit(1)
        check_compare_command = True

    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    # Quote the paths: the command goes through a shell, so blanks or shell
    # metacharacters in a file name must not split or alter the arguments.
    compare_command = (compare_tool + ' -r ' + quote(reference_file) + ' -q ' + quote(query_file)
                       + ' -g ' + quote(genome_file) + ' -o ' + quote(output_prefix))

    if args.H:
        compare_command += ' -G '

    if args.normalize:
        compare_command += ' -n '

    if args.t is not None and int(args.t) > 1:
        compare_command += ' -t ' + str(args.t)
    shell_run(compare_command)
+
+
def varmatch_pairwise(reference_file, query_file, genome_file, output_directory):
    """Compare one reference VCF with one query VCF against a single genome.

    With --purify both inputs are first purified into the temp directory and
    the purified copies are compared; otherwise the inputs are compared as
    given.  Returns the output prefix handed to the compare tool:
    ``<output_directory>/<reference basename>_<query basename>``.
    """
    ref_name = os.path.basename(reference_file)
    que_name = os.path.basename(query_file)

    purified_ref = temp_dir + '/' + ref_name + '.purify.vcf'
    purified_que = temp_dir + '/' + que_name + '.purify.vcf'

    compare_ref, compare_que = reference_file, query_file
    if args.purify:
        purify(reference_file, purified_ref, genome_file)
        purify(query_file, purified_que, genome_file)
        compare_ref, compare_que = purified_ref, purified_que

    prefix = output_directory + '/' + ref_name + '_' + que_name
    pairwise_compare(compare_ref, compare_que, genome_file, prefix)
    return prefix
+
+
def detect_multi_genome(genome_list_file, chr_list):
    """Read a genome list file into {chromosome name: FASTA path}.

    Each useful line holds a chromosome name and a path separated by
    whitespace.  Only names contained in *chr_list* are kept; a later line for
    the same name overrides an earlier one.  Blank lines and lines with fewer
    than two fields are ignored.
    """
    genome_dict = {}
    with open(genome_list_file) as f:
        for line in f:
            columns = line.split()
            if len(columns) < 2:
                # blank or incomplete line: nothing to register
                continue
            chr_name = columns[0]
            if chr_name in chr_list:
                genome_dict[chr_name] = columns[1]
    return genome_dict
+
+
def split_multi_genome(vcf_file, detected_chr_list):
    """Split *vcf_file* into one VCF per chromosome inside the temp directory.

    Header lines ('#...') are copied into every output.  A record goes to the
    output of chromosome c when its first column is c or 'chr' + c; records of
    other contigs are dropped.

    Returns {chromosome: path of its split file}; the paths have the form
    ``<temp_dir>/<basename>.<chromosome>.vcf``.
    """
    print ('Split variant file according to chromosomes...')
    basename = os.path.basename(vcf_file)
    vcf_name_dict = {}
    for c in detected_chr_list:
        vcf_name_dict[c] = temp_dir + '/' + basename + '.' + c + '.vcf'

    vcf_handle_dict = {}
    try:
        for c in detected_chr_list:
            vcf_handle_dict[c] = open(vcf_name_dict[c], 'w')

        with open(vcf_file) as f:
            for line in f:
                if line.startswith('#'):
                    for c in detected_chr_list:
                        vcf_handle_dict[c].write(line)
                    continue
                # Split once per record and look the handle up directly
                # instead of comparing against every chromosome in turn.
                chromosome_name = line.split('\t', 1)[0]
                handle = vcf_handle_dict.get(chromosome_name)
                if handle is None and chromosome_name.startswith('chr'):
                    handle = vcf_handle_dict.get(chromosome_name[3:])
                if handle is not None:
                    handle.write(line)
    finally:
        # close whatever was opened, also when reading or writing failed
        for handle in vcf_handle_dict.values():
            handle.close()
    return vcf_name_dict
+
+
def varmatch_multi_genome(reference_file, query_file, genome_list_file):
    """Compare two whole-genome VCFs chromosome by chromosome.

    Both VCFs are split per chromosome listed in *genome_list_file*, each pair
    is compared with varmatch_pairwise() into the temp directory, and the
    per-chromosome results are merged into the output directory as
    ``<reference basename>_<query basename>.stat`` (six summed counters, one
    per line) and ``...match`` (concatenated records under a fresh header).

    Terminates the program with status 1 if a listed genome file is missing.
    """
    # split vcf according to chromosome and then use varmatch_pairwise
    genome_dict = detect_multi_genome(genome_list_file, human_chromosome_list)
    detected_chr_list = sorted(genome_dict)
    print ('\t[Multiple genome mode]')
    print ('\tDetected genomes:')
    print ('\t' + ','.join(detected_chr_list) + '\n')
    for c in detected_chr_list:
        if not os.path.isfile(genome_dict[c]):
            print ('[Error:] Can not find genome file ' + genome_dict[c])
            sys.exit(1)

    ref_vcf_dict = split_multi_genome(reference_file, detected_chr_list)
    que_vcf_dict = split_multi_genome(query_file, detected_chr_list)

    # varmatch_pairwise() returns the prefix of the .stat/.match files it made
    chr_prefix_dict = {}
    for c in detected_chr_list:
        print('Matching chromosome ' + c + '...')
        chr_prefix_dict[c] = varmatch_pairwise(ref_vcf_dict[c], que_vcf_dict[c], genome_dict[c], temp_dir)

    total_stat = [0] * 6  # total_ref, total_que, matched_ref, matched_que, mismatch_ref, mismatch_que

    for c in detected_chr_list:
        with open(chr_prefix_dict[c] + '.stat') as f:
            lines = f.readlines()
        # Bound by the lines actually present: a short .stat file must not
        # raise IndexError.
        for i in range(min(len(total_stat), len(lines))):
            total_stat[i] += int(lines[i].strip())

    total_prefix = output_dir + '/' + os.path.basename(reference_file) + '_' + os.path.basename(query_file)
    with open(total_prefix + '.stat', 'w') as f:
        for s in total_stat:
            f.write(str(s))
            f.write('\n')

    with open(total_prefix + '.match', 'w') as total_complex_file:
        total_complex_file.write('##VCF1:' + reference_file + '\n')
        total_complex_file.write('##VCF2:' + query_file + '\n')
        total_complex_file.write('#CHROM\tPOS\tREF\tALT\tVCF1\tVCF2\n')
        for c in detected_chr_list:
            with open(chr_prefix_dict[c] + '.match') as complex_f:
                for line in complex_f:
                    # per-chromosome headers are replaced by the one above
                    if line.startswith('#'):
                        continue
                    total_complex_file.write(line)
+
+
def varmatch_multi_vcf_multi_genome(multi_vcf_list, genome_list_file):
    """Find the variants common to several whole-genome VCFs.

    Every VCF is split per chromosome listed in *genome_list_file*; for each
    chromosome varmatch_multi_vcf_single_genome() writes
    ``<temp_dir>/common.<chromosome>.match``; those files are then merged, in
    sorted chromosome order, into ``<output_dir>/common.match``.

    Terminates the program with status 1 if a genome or VCF file is missing.
    """
    genome_dict = detect_multi_genome(genome_list_file, human_chromosome_list)
    detected_chr_list = sorted(genome_dict)
    print ('\t[Multiple genome multiple vcf mode]')
    print ('\tDetected genomes:')
    print ('\t' + ','.join(detected_chr_list) + '\n')
    for c in detected_chr_list:
        if not os.path.isfile(genome_dict[c]):
            print ('[VarMatch:Error:] Can not find genome file ' + genome_dict[c])
            sys.exit(1)

    # VCFs are identified by their 0-based position on the command line
    id_vcf_dict = {}
    for vcfid, vcf_path in enumerate(multi_vcf_list):
        if not os.path.isfile(vcf_path):
            print ('[VarMatch:Error:] Can not find vcf file ' + vcf_path)
            sys.exit(1)
        id_vcf_dict[vcfid] = vcf_path
    vcfid_list = sorted(id_vcf_dict)

    # dict of dict: vcf id -> {chromosome -> split vcf file}
    vcfid_chr_vcfsplit_dict = {}
    for vcfid in vcfid_list:
        vcfid_chr_vcfsplit_dict[vcfid] = split_multi_genome(id_vcf_dict[vcfid], detected_chr_list)

    chr_prefix_dict = {}
    for c in detected_chr_list:
        id_singlechrvcf_dict = {}
        for vcfid in vcfid_list:
            id_singlechrvcf_dict[vcfid] = vcfid_chr_vcfsplit_dict[vcfid][c]
        output_prefix = temp_dir + '/common.' + c
        chr_prefix_dict[c] = output_prefix
        varmatch_multi_vcf_single_genome(id_singlechrvcf_dict, genome_dict[c], output_prefix)

    # merge multi chromosome
    with open(output_dir + '/common.match', 'w') as common_file:
        for vcfid in vcfid_list:
            common_file.write('##VCF' + str(vcfid + 1) + ':' + id_vcf_dict[vcfid] + '\n')

        head_line = '#CHROM\tPOS\tREF\tALT'
        for vcfid in vcfid_list:
            head_line += '\tVCF' + str(vcfid + 1)
        head_line += '\n'
        common_file.write(head_line)

        # Walk the sorted list, not the dict, so the chromosome order of the
        # merged file is the same on every run.
        for c in detected_chr_list:
            with open(chr_prefix_dict[c] + '.match') as f:
                for line in f:
                    if line.startswith('#'):
                        continue
                    common_file.write(line)
+
+
def varmatch_multi_vcf_single_genome(id_vcf_dict, genome_file, output_prefix):
    """
    id_vcf_dict key: id, value: vcf file

    Chains pairwise comparisons over the VCFs in id order (1st vs 2nd, 2nd vs
    3rd, ...) and writes to ``<output_prefix>.match`` the variants that show up
    in every pairwise .match file.  A variant is identified by columns 2-4 of a
    match record joined with '@'.  The number of common variants is printed.
    """
    id_list = sorted(id_vcf_dict)

    # id -> file that is actually compared (purified copy when --purify is on)
    id_finalname_dict = {}
    for vcf_id in id_list:
        vcf_file = id_vcf_dict[vcf_id]
        if not os.path.isfile(vcf_file):
            print('Error: Can not open vcf file ' + vcf_file)
        basename = os.path.basename(vcf_file)

        purify_file = temp_dir + '/' + basename + '.purify.vcf'
        if args.purify:
            purify(vcf_file, purify_file, genome_file)
        else:
            purify_file = vcf_file

        id_finalname_dict[vcf_id] = purify_file

    # one comparison per pair of neighbouring ids
    pairwise_prefix_list = []
    for i in range(len(id_list) - 1):
        ref_filename = id_finalname_dict[id_list[i]]
        que_filename = id_finalname_dict[id_list[i + 1]]
        pairwise_prefix_list.append(varmatch_pairwise(ref_filename, que_filename, genome_file, temp_dir))

    # summarize
    # [todo] summarize simple matches
    # summarize complex matches
    # [todo] summarize matching number
    variantid_variant = {}  # variant id -> first four columns of its record
    variantid_info = {}     # variant id -> per-VCF columns collected so far
    for i, pairwise_prefix in enumerate(pairwise_prefix_list):
        complex_match_file = pairwise_prefix + '.match'
        if not os.path.isfile(complex_match_file):
            print('Error: Can not open match result ' + complex_match_file)
        with open(complex_match_file) as f:
            for line in f:
                if line.startswith('#'):
                    continue
                line = line.strip()
                if not line:
                    continue
                columns = line.split('\t')
                variantid = '@'.join(columns[1:4])
                if i == 0:
                    # the first pair contributes the columns of two VCFs
                    variantid_variant[variantid] = columns[:4]
                    variantid_info[variantid] = columns[4:]
                elif variantid in variantid_info:
                    # every later pair contributes its last column
                    variantid_info[variantid].append(columns[-1])

    with open(output_prefix + '.match', 'w') as integrate_complex_file:
        # Look the file name up by id; a bare loop counter is only right when
        # the ids happen to be 0..n-1.
        for i, vcf_id in enumerate(id_list):
            integrate_complex_file.write('##VCF' + str(i + 1) + ':' + id_vcf_dict[vcf_id] + '\n')

        head_line = '#CHROM\tPOS\tREF\tALT'
        for i in range(len(id_list)):
            head_line += '\tVCF' + str(i + 1)
        # The newline matters: without it the first variant record is glued to
        # the '#' header line and lost to every reader that skips '#' lines.
        head_line += '\n'
        integrate_complex_file.write(head_line)

        common_complex_num = 0
        for variantid in sorted(variantid_info):
            # common = one entry collected for every VCF
            if len(variantid_info[variantid]) == len(id_list):
                common_complex_num += 1
                merge_list = variantid_variant[variantid] + variantid_info[variantid]
                integrate_complex_file.write('\t'.join(merge_list) + '\n')

    print(common_complex_num)
+
+
+ def remove_duplicate(genome_filename, single_vcf_filename, output_prefix):
+     """Remove duplicated variants from a single VCF file (currently disabled).
+
+     This version only prints a notice and terminates the program.  The
+     statements after ``sys.exit()`` are unreachable; they are kept because
+     they show how the compare tool is meant to be invoked once the feature
+     is switched back on.
+     """
+     print ("current version does not support remove duplicate")
+     sys.exit()
+     remove_duplicate_command = compare_tool + ' -m ' + single_vcf_filename + ' -g ' + genome_filename + ' -o ' + output_prefix
+
+     if args.t is not None and int(args.t) > 1:
+         remove_duplicate_command += ' -t ' + args.t
+     shell_run(remove_duplicate_command)
+
+
+ def main():
+     """Command-line entry point.
+
+     Initialises the module-level settings used by the other functions,
+     creates the output and temp directories, and dispatches to the mode
+     selected by the parsed arguments (remove-duplicate, multi-genome,
+     multi-VCF or plain pairwise comparison).
+     """
+     if len(sys.argv) < 2:
+         parser.print_help()
+         sys.exit()
+
+     # initialize global variables
+     global check_purify_command
+     global check_normalize_command
+     global check_compare_command
+
+     global script_path
+     global purify_tool
+     global compare_tool
+     global output_dir
+     global visual_dir
+     global temp_dir
+
+     check_purify_command = False
+     check_normalize_command = False
+     check_compare_command = True
+
+     script_path = sys.path[0]
+     purify_tool = script_path + '/purify'
+     compare_tool = script_path + '/vm'
+
+     # create output directory (default: ./output)
+     if args.output is None or args.output == '':
+         output_dir = os.getcwd() + '/output'
+     else:
+         output_dir = args.output
+     temp_dir = output_dir + '/temp'
+     visual_dir = output_dir + '/visualization'
+
+     # makedirs also creates missing parents of a nested output path,
+     # which a plain mkdir would reject.
+     for directory in (output_dir, temp_dir):
+         if not os.path.isdir(directory):
+             os.makedirs(directory)
+
+     # print args.r, args.q
+     if args.remove_dup is not None and args.remove_dup != '':
+         if not os.path.isfile(args.remove_dup):
+             print ('\tError in remove duplication mode:\n')
+             print ('\tCan not find vcf file: ' + args.remove_dup)
+         basename = os.path.basename(args.remove_dup)
+         output_prefix = output_dir + '/' + basename + '.nodup'
+         remove_duplicate(args.g, args.remove_dup, output_prefix)
+
+     if args.multi_genome is not None and args.multi_genome != '':
+         if args.multi_vcf is not None:
+             # multi genome, multi vcf
+             varmatch_multi_vcf_multi_genome(args.multi_vcf, args.multi_genome)
+         elif args.remove_dup is not None:
+             # multi genome, single vcf(remove duplicates)
+             pass
+         else:
+             # pure multi genome, to compare two genome
+             varmatch_multi_genome(args.r, args.q, args.multi_genome)
+     elif args.multi_vcf is not None:
+         # multi vcf, single chromosome: ids are the 0-based positions
+         # of the files on the command line
+         output_prefix = output_dir + '/common'
+         id_vcf_dict = dict(enumerate(args.multi_vcf))
+         varmatch_multi_vcf_single_genome(id_vcf_dict, args.g, output_prefix)
+     elif args.remove_dup is not None:
+         pass
+     else:
+         # single chromosome, pairwise compare
+         varmatch_pairwise(args.r, args.q, args.g, output_dir)
+
+
+ # Run the command-line entry point only when this file is executed as a
+ # script, not when it is imported.
+ if __name__ == '__main__':
+ main()
diff --git a/src/diploid.cpp b/src/diploid.cpp
new file mode 100644
index 0000000..9b5470b
--- /dev/null
+++ b/src/diploid.cpp
@@ -0,0 +1,3562 @@
+// code
+// author: Chen Sun, chensun at cse.psu.edu
+#include "diploid.h"
+
+// inline function protected
+// code reviewed by Channing
+
+
+ // Case-insensitive equality test for two sequences.
+ // Returns true when s1 and s2 have the same length and every pair of
+ // characters is equal after upper-casing.
+ inline bool CompareSequence(string s1, string s2) {
+     if (s1.size() != s2.size()) return false;
+     // Compare through unsigned char: passing a negative char value to
+     // toupper() is undefined behaviour.
+     return std::equal(s1.begin(), s1.end(), s2.begin(),
+                       [](unsigned char a, unsigned char b) {
+                           return ::toupper(a) == ::toupper(b);
+                       });
+ }
+
+ // Returns true when the shorter of the two strings is a prefix of the
+ // longer one (an empty string is a prefix of everything).
+ inline bool PrefixMatch( std::string const& lhs, std::string const& rhs )
+ {
+     const std::string::size_type shared = std::min( lhs.size(), rhs.size() );
+     return lhs.compare( 0, shared, rhs, 0, shared ) == 0;
+ }
+
+ // Constructs the matcher, forwarding thread_num_ to the VCF base class
+ // constructor. Scoring by base pair (scoring_basepair) starts switched off,
+ // and the thread number is written to the debug stream.
+ DiploidVCF::DiploidVCF(int thread_num_):VCF(thread_num_)
+ {
+ scoring_basepair = false;
+ dout << "DiploidVCF() Thread Number: " << thread_num << endl;
+ }
+
+ // Destructor: the body is empty, nothing is released explicitly here.
+ DiploidVCF::~DiploidVCF()
+ {
+ }
+
+// private
+ // Reads the reference VCF `filename` into ref_variant_list, tagging every
+ // variant with flag 0. Returns the value returned by ReadDiploidVCF.
+ int DiploidVCF::ReadRefVCF(string filename) {
+ return ReadDiploidVCF(filename, ref_variant_list, 0);
+ }
+
+// private
+ // Reads the query VCF `filename` into que_variant_list, tagging every
+ // variant with flag 1. Returns the value returned by ReadDiploidVCF.
+ int DiploidVCF::ReadQueryVCF(string filename) {
+ return ReadDiploidVCF(filename, que_variant_list, 1);
+ }
+// protected
+// [todo] unit test normalization
+// normalization modifies vt normalize algorithm
+// code reviewed by Channing 4/2/2016
+ // Normalizes `var` in place against genome_sequence: shared trailing bases
+ // are trimmed (extending to the left with genome bases whenever an allele
+ // would become empty), then shared leading bases are trimmed.
+ // Returns false, leaving `var` untouched, when no genome is loaded, an
+ // allele is empty, the position lies outside the genome, or the first
+ // reference base disagrees with the genome. Returns true otherwise.
+ bool DiploidVCF::NormalizeDiploidVariant(DiploidVariant & var) {
+     int pos = var.pos;
+     string parsimonious_ref = var.ref;
+     string parsimonious_alt0 = var.alts[0];
+     string parsimonious_alt1 = var.alts[0];
+     if (var.heterozygous && var.multi_alts)
+         parsimonious_alt1 = var.alts[1];
+
+     int left_index = pos;
+     if (genome_sequence.size() == 0) return false;
+     // back() on an empty allele is undefined; such a record cannot be normalized
+     if (parsimonious_ref.empty() || parsimonious_alt0.empty() || parsimonious_alt1.empty()) return false;
+     if (parsimonious_ref.size() == 1 && parsimonious_alt0.size() == 1 && parsimonious_alt1.size() == 1) return true;
+     // never index the genome outside its bounds
+     if (left_index < 0 || left_index >= (int)genome_sequence.size()) return false;
+     if (toupper(genome_sequence[left_index]) != toupper(parsimonious_ref[0])) {
+         dout << "[Error] genome sequence, subsequence, offset does not match." << endl;
+         return false;
+     }
+     bool change_in_allels = true;
+     while (change_in_allels) {
+         change_in_allels = false;
+         if (toupper(parsimonious_ref.back()) == toupper(parsimonious_alt0.back()) && toupper(parsimonious_ref.back()) == toupper(parsimonious_alt1.back())) {
+             if ((parsimonious_ref.size() > 1 && parsimonious_alt0.size() > 1 && parsimonious_alt1.size() > 1) || left_index > 0) { // when left_index == 0, can not make further changes
+                 parsimonious_ref.pop_back();
+                 parsimonious_alt0.pop_back();
+                 parsimonious_alt1.pop_back();
+                 change_in_allels = true;
+             }
+             // else do not make further changes
+         }
+         if (parsimonious_ref.length() == 0 || parsimonious_alt0.length() == 0 || parsimonious_alt1.length() == 0) {
+             // an allele ran empty: extend every allele one genome base to the left
+             left_index--;
+             char left_char = toupper(genome_sequence[left_index]);
+             parsimonious_ref = left_char + parsimonious_ref;
+             parsimonious_alt0 = left_char + parsimonious_alt0;
+             parsimonious_alt1 = left_char + parsimonious_alt1;
+         }
+     }
+     while (toupper(parsimonious_ref[0]) == toupper(parsimonious_alt0[0]) &&
+            toupper(parsimonious_ref[0]) == toupper(parsimonious_alt1[0]) &&
+            parsimonious_ref.size() > 1 &&
+            parsimonious_alt0.size() > 1 &&
+            parsimonious_alt1.size() > 1)
+     {
+         parsimonious_ref.erase(0, 1);
+         parsimonious_alt0.erase(0, 1);
+         parsimonious_alt1.erase(0, 1);
+         left_index ++; // left_index is the variant position: dropping the leftmost base moves it right by one
+     }
+     var.pos = left_index;
+     var.ref = parsimonious_ref;
+     var.alts[0] = parsimonious_alt0;
+     if (var.heterozygous && var.multi_alts)
+         var.alts[1] = parsimonious_alt1;
+     return true;
+ }
+
+ // Normalizes the three allele strings in place against genome_sequence
+ // (trim shared trailing bases, extend left when an allele runs empty, then
+ // trim shared leading bases) and returns the resulting 0-based position.
+ // Returns -1 when no genome is loaded or an allele is empty.
+ int DiploidVCF::NormalizeVariantSequence(int pos, string & parsimonious_ref, string & parsimonious_alt0, string & parsimonious_alt1) {
+
+     int left_index = pos;
+     if (genome_sequence.size() == 0) return -1;
+     // back() on an empty allele is undefined; report it as an error
+     if (parsimonious_ref.empty() || parsimonious_alt0.empty() || parsimonious_alt1.empty()) return -1;
+     // single-base alleles are already normalized: the position is unchanged
+     // (this used to return `true`, i.e. position 1)
+     if (parsimonious_ref.size() == 1 && parsimonious_alt0.size() == 1 && parsimonious_alt1.size() == 1) return pos;
+
+     bool change_in_allels = true;
+     while (change_in_allels) {
+         change_in_allels = false;
+         if (parsimonious_ref.back() == parsimonious_alt0.back() && parsimonious_ref.back() == parsimonious_alt1.back() ) {
+             if ((parsimonious_ref.size() > 1 && parsimonious_alt0.size() > 1 && parsimonious_alt1.size() > 1) || left_index > 0) { // when left_index == 0, can not make further changes
+                 parsimonious_ref.pop_back();
+                 parsimonious_alt0.pop_back();
+                 parsimonious_alt1.pop_back();
+                 change_in_allels = true;
+             }
+             // else do not make further changes
+         }
+         if (parsimonious_ref.length() == 0 || parsimonious_alt0.length() == 0 || parsimonious_alt1.length() == 0) {
+             // an allele ran empty: extend every allele one genome base to the left
+             left_index--;
+             char left_char = toupper(genome_sequence[left_index]);
+             parsimonious_ref = left_char + parsimonious_ref;
+             parsimonious_alt0 = left_char + parsimonious_alt0;
+             parsimonious_alt1 = left_char + parsimonious_alt1;
+         }
+     }
+     while (parsimonious_ref[0] == parsimonious_alt0[0] &&
+            parsimonious_ref[0] == parsimonious_alt1[0] &&
+            parsimonious_ref.size() > 1 &&
+            parsimonious_alt0.size() > 1 &&
+            parsimonious_alt1.size() > 1)
+     {
+         parsimonious_ref.erase(0, 1);
+         parsimonious_alt0.erase(0, 1);
+         parsimonious_alt1.erase(0, 1);
+         left_index ++; // left_index is the variant position: dropping the leftmost base moves it right by one
+     }
+     return left_index;
+ }
+
+ // Loads the FASTA file `filename` into genome_sequence: header lines
+ // (starting with '>') and empty lines are skipped, every other line is
+ // appended as-is. If the file cannot be opened a message is printed and
+ // genome_sequence is left unchanged.
+ void DiploidVCF::ReadGenome(string filename) {
+     ifstream genome_file;
+     genome_file.open(filename.c_str());
+     if (!genome_file.good()) {
+         cout << "[VarMatch] can not open FASTA file: ";
+         cout << filename << endl;
+         return;
+     }
+     genome_sequence = "";
+     string line;
+     // Loop on the read itself so a failed stream ends the loop instead of
+     // spinning forever on !eof().
+     while (getline(genome_file, line, '\n')) {
+         // drop the CR of CRLF files so it never ends up in the sequence
+         if (!line.empty() && line.back() == '\r') line.pop_back();
+         // only skip truly empty lines: a one-base line is valid sequence
+         if (line.empty()) continue;
+         if (line[0] == '>') continue;
+         genome_sequence += line;
+     }
+     genome_file.close();
+     return;
+ }
+
+// protected
+// code reviewed by Channing and Succulent on 4/2/2016
+ // Parses the VCF file `filename` and appends one DiploidVariant per usable
+ // record to x_variant_list, tagged with `flag` (0 = reference set,
+ // 1 = query set). REF and ALT are upper-cased, positions become 0-based,
+ // and each variant is normalized when `normalization` is set.
+ // Skipped records: header lines, lines with fewer than 6 columns, alleles
+ // of VAR_LEN or more characters, unrecognized genotypes, and (while
+ // genotype matching is on) 0/0 genotypes.
+ // Returns the number of variants appended, or -1 if the file cannot be opened.
+ int DiploidVCF::ReadDiploidVCF(string filename, vector<DiploidVariant> & x_variant_list, int flag) {
+     // read and change all sequence to upper case
+     int total_num = 0;
+     ifstream vcf_file;
+     vcf_file.open(filename.c_str());
+     if (!vcf_file.good()) {
+         cout << "[VarMatch] Error: can not open vcf file" << endl;
+         return -1;
+     }
+     int genotype_index = -1; // index of "GT" inside the FORMAT column, found once
+     string line;
+     // Loop on the read itself so a failed stream cannot loop forever.
+     while (getline(vcf_file, line, '\n')) {
+         // check ineligible lines
+         if ((int)line.length() <= 1) continue;
+         if (line[0] == '#') continue;
+
+         auto columns = split(line, '\t');
+         if (columns.size() < 10) {
+             if (match_genotype) {
+                 cout << "[VarMatch] Warning: not enough information in VCF file for genotype matching." << endl;
+                 cout << "[VarMatch] \tAutomatically turn off genotype matching module " << filename << endl;
+                 match_genotype = false;
+                 // fall through: the record that triggered the warning is
+                 // still a usable variant and must not be dropped
+             }
+             if (columns.size() < 6) {
+                 cout << "[VarMatch] Warning: not enough information in VCF file for variant matching." << endl;
+                 cout << "[VarMatch] skip current variant: " << line << endl;
+                 continue;
+             }
+         }
+         if (chromosome_name == ".") chromosome_name = columns[0];
+         auto pos = atoi(columns[1].c_str()) - 1; // 0-based coordinate
+
+         auto ref = columns[3];
+         if (ref.size() >= VAR_LEN) continue;
+         auto alt_line = columns[4];
+
+         ToUpper(ref);
+         ToUpper(alt_line);
+
+         bool is_heterozygous_variant = false;
+         bool is_multi_alternatives = false;
+
+         if (columns.size() >= 10) {
+             if (genotype_index < 0) {
+                 auto formats = split(columns[8], ':');
+                 for (int i = 0; i < (int)formats.size(); i++) {
+                     if (formats[i] == "GT") {
+                         genotype_index = i;
+                         break;
+                     }
+                 }
+                 if (genotype_index < 0) {
+                     cout << "[VarMatch] VCF entry does not contain genotype information" << endl;
+                     continue;
+                 }
+             }
+             auto additionals = split(columns[9], ':');
+             if (genotype_index >= (int)additionals.size()) {
+                 // sample column is shorter than FORMAT promised
+                 cout << "[VarMatch] VCF entry does not contain genotype information" << endl;
+                 continue;
+             }
+             string genotype = additionals[genotype_index];
+
+             // Try both separators for every record, so a file may mix
+             // unphased (0/1) and phased (0|1) genotypes.
+             vector<string> genotype_columns = split(genotype, '/');
+             if (genotype_columns.size() != 2) {
+                 genotype_columns = split(genotype, '|');
+             }
+             if (genotype_columns.size() != 2) {
+                 cout << "[VarMatch] Warning Unrecognized Genotype: " << genotype << endl;
+                 continue;
+             }
+             if (genotype_columns[0] != genotype_columns[1]) {
+                 is_heterozygous_variant = true;
+             }
+
+             // homozygous reference: not a variant when genotypes matter
+             if (genotype_columns[1] == "0" && genotype_columns[0] == "0" && match_genotype) {
+                 continue;
+             }
+         }
+
+         vector<string> alt_list;
+         if (alt_line.find(",") != std::string::npos) {
+             alt_list = split(alt_line, ',');
+             // need two alternatives before alt_list[1] may be touched
+             if (alt_list.size() < 2) continue;
+             if (alt_list[0].size() >= VAR_LEN || alt_list[1].size() >= VAR_LEN) continue;
+             is_multi_alternatives = true;
+         }
+         else {
+             if (alt_line.size() >= VAR_LEN) continue;
+             alt_list.push_back(alt_line);
+         }
+
+         // longest insertion / deletion over the alternatives considered
+         int snp_ins = max(0, (int)alt_list[0].length() - (int)ref.length());
+         int snp_del = max(0, (int)ref.length() - (int)alt_list[0].length());
+         if (is_multi_alternatives) {
+             snp_ins = max(snp_ins, (int)alt_list[1].length() - (int)ref.length());
+             snp_del = max(snp_del, (int)ref.length() - (int)alt_list[1].length());
+         }
+
+         DiploidVariant dv(pos, ref, alt_list, is_heterozygous_variant, is_multi_alternatives, snp_del, snp_ins, flag);
+         if (normalization) {
+             NormalizeDiploidVariant(dv);
+         }
+         x_variant_list.push_back(dv);
+
+         total_num++;
+     }
+     vcf_file.close();
+     return total_num;
+ }
+
+// protected override
+// code reviewed by Channing and Succulent on 4/2/2016
+ // Splits the genome into thread_num equally wide position ranges: the upper
+ // bound of each range is appended to pos_boundries (the last one being the
+ // genome size), one empty position->variant map per thread is appended to
+ // refpos_2_var and querypos_2_var, and boundries_decided is set.
+ void DiploidVCF::DecideBoundaries() {
+     const int genome_size = genome_sequence.size();
+     if (genome_size == 0) {
+         dout << "[VarMatch] Warning: no genome sequence detected when decide boundries. " << endl;
+     }
+
+     const int distance = genome_size / thread_num;
+     for (int part = 1; part < thread_num; part++) {
+         pos_boundries.push_back(part * distance);
+     }
+     pos_boundries.push_back(genome_size);
+
+     for (int t = 0; t < thread_num; t++) {
+         unordered_map<int, DiploidVariant> empty_ref_map;
+         unordered_map<int, DiploidVariant> empty_query_map;
+         refpos_2_var.push_back(empty_ref_map);
+         querypos_2_var.push_back(empty_query_map);
+     }
+
+     boundries_decided = true;
+ }
+
+//private
+ // Direct matching for one thread: every reference variant that has an equal
+ // query variant at the same position is recorded in
+ // direct_match_records[thread_index] as "chrom<TAB>pos+1<TAB>ref<TAB>alts"
+ // and removed from both maps. Unmatched variants stay in place.
+ void DiploidVCF::DirectSearchInThread(unordered_map<int, DiploidVariant> & ref_snps,
+                                       unordered_map<int, DiploidVariant> & query_snps,
+                                       int thread_index) {
+     auto ref_iter = ref_snps.begin();
+     while (ref_iter != ref_snps.end()) {
+         auto query_iter = query_snps.find(ref_iter->first);
+         if (query_iter == query_snps.end()) {
+             ++ref_iter;
+             continue;
+         }
+         DiploidVariant r_var = ref_iter->second;
+         DiploidVariant q_var = query_iter->second;
+         if (!(r_var == q_var)) {
+             ++ref_iter;
+             continue;
+         }
+
+         auto alt_string = r_var.alts[0];
+         if (r_var.multi_alts)
+             alt_string += "," + r_var.alts[1];
+         string matching_result = chromosome_name + '\t' + to_string(r_var.pos + 1) + "\t" + r_var.ref + "\t";
+         matching_result += alt_string;
+         direct_match_records[thread_index]->push_back(matching_result);
+
+         // erase() hands back the next valid iterator of the reference map
+         ref_iter = ref_snps.erase(ref_iter);
+         query_snps.erase(query_iter);
+     }
+ }
+
+// directly match by position
+// private
+ // Runs DirectSearchInThread on every per-thread pair of position maps
+ // (thread_num - 1 worker threads plus the calling thread), then writes all
+ // recorded matches to output_simple_filename under a three-line header and
+ // frees the per-thread record buffers.
+ void DiploidVCF::DirectSearchMultiThread() {
+
+     direct_match_records = new vector<string>*[thread_num];
+     for (int t = 0; t < thread_num; t++) {
+         direct_match_records[t] = new vector<string>;
+     }
+
+     // spawn the workers
+     vector<thread> threads;
+     unsigned i = 0;
+     for (; i < thread_num - 1; i++) {
+         threads.push_back(thread(&DiploidVCF::DirectSearchInThread, this, ref(refpos_2_var[i]), ref(querypos_2_var[i]), i));
+     }
+     // the calling thread handles the last slice itself,
+     // so i must equal thread_num - 1 here
+     if (i != thread_num - 1) {
+         dout << "[Error] thread number not match" << endl;
+     }
+     DirectSearchInThread(refpos_2_var[i], querypos_2_var[i], i);
+
+     // wait for every worker before touching the records
+     for (auto & worker : threads) {
+         worker.join();
+     }
+     threads.clear();
+
+     ofstream output_simple_file;
+     output_simple_file.open(output_simple_filename);
+     output_simple_file << "##VCF1:" << ref_vcf_filename << endl;
+     output_simple_file << "##VCF2:" << que_vcf_filename << endl;
+     output_simple_file << "#CHROM\tPOS\tREF\tALT" << endl;
+     for (int t = 0; t < thread_num; t++) {
+         for (const string & record : *direct_match_records[t]) {
+             output_simple_file << record << endl;
+         }
+     }
+     output_simple_file.close();
+
+     for (int t = 0; t < thread_num; t++) {
+         delete direct_match_records[t];
+     }
+     delete[] direct_match_records;
+ }
+
+ // Exhaustive matching of one cluster of variants (reference and query
+ // mixed, told apart by their flag). The variants are sorted and split by
+ // flag, then RecurrentMatchWithIndel searches for the allele choices with
+ // the highest score. On success the match is printed to stdout, appended to
+ // complex_match_records[thread_index], and the per-thread counters of
+ // matched reference / query variants are increased.
+ // Returns false when neither side contains an indel, when one side has no
+ // variants, or when no match with a positive score exists.
+ // NOTE(review): positions are written 0-based here (to_string(offset),
+ // to_string(pos)) whereas VariantMatch and DirectSearchInThread write
+ // pos + 1 - confirm which convention consumers of this record expect.
+ bool DiploidVCF::RecurrentVariantMatch(vector<DiploidVariant> & variant_list, int thread_index) {
+     sort(variant_list.begin(), variant_list.end());
+     map<int, DiploidVariant> separate_pos_var[2];
+     // must start out false: the loop below only ever sets the flags to true
+     bool separate_contians_indel[2] = { false, false };
+     // separate into ref and que
+     int min_pos = genome_sequence.length() + 1;
+     int max_pos = -1;
+     for (int i = 0; i < variant_list.size(); i++) {
+         int flag = variant_list[i].flag; // flag indicate if the variant is from ref set or query set
+         int pos = variant_list[i].pos;
+         separate_pos_var[flag][pos] = variant_list[i];
+         auto ref_sequence = variant_list[i].ref;
+         auto alt_sequences = variant_list[i].alts;
+
+         min_pos = min(pos, min_pos);
+         max_pos = max((int)(pos + ref_sequence.length()), max_pos);
+
+         if (ref_sequence.length() != alt_sequences[0].length())
+             separate_contians_indel[flag] = true;
+         if (variant_list[i].multi_alts) {
+             if (ref_sequence.length() != alt_sequences[1].length()) {
+                 separate_contians_indel[flag] = true;
+             }
+         }
+     }
+
+     // widen the window by one base on each side, clamped to the genome
+     min_pos = max(min_pos - 1, 0);
+     max_pos = min(max_pos + 1, (int)genome_sequence.length());
+
+     if (!separate_contians_indel[0] && !separate_contians_indel[1]) {
+         // There is no way that there will be a match
+         return false;
+     }
+     if (separate_pos_var[0].size() == 0 || separate_pos_var[1].size() == 0) {
+         return false;
+     }
+
+     string subsequence = genome_sequence.substr(min_pos, max_pos-min_pos);
+     int offset = min_pos;
+     // 0 for ref, 1 for query, same as flag
+     // every position starts undecided (-1) on both haplotypes
+     map<int, int> choices[4];
+     for(int i = 0; i < 2; i++){
+         for(int j = 0; j < 2; j++){
+             for(auto it = separate_pos_var[i].begin(); it != separate_pos_var[i].end(); ++it){
+                 auto pos = it->first;
+                 choices[i*2+j][pos] = -1;
+             }
+         }
+     }
+     map<int, int> max_matches[4];
+     string max_paths[2];
+     int max_score = 0;
+     RecurrentMatchWithIndel(variant_list,
+                             subsequence,
+                             offset,
+                             0,
+                             separate_pos_var,
+                             choices,
+                             max_matches,
+                             max_score,
+                             max_paths);
+     if (max_score == 0) {
+         return false;
+     }
+
+     // matched, print out matches
+     bool multiple_match = true;
+     if (CompareSequence(max_paths[1], subsequence) || CompareSequence(max_paths[1], max_paths[0])) {
+         multiple_match = false;
+     }
+     string alt_record = max_paths[0];
+     if (multiple_match)
+         alt_record += "/" + max_paths[1];
+     string match_record = chromosome_name + "\t" + to_string(offset) + "\t" + subsequence + "\t" + alt_record;
+     string vcf_record[2] = { "" };
+     string phase_record[4] = { "" };
+
+     // a variant counts as matched when either haplotype chose one of its alternatives
+     map<int, bool> separate_pos_matched[2];
+     for (int i = 0; i < 2; i++) {
+         for (auto it = separate_pos_var[i].begin(); it != separate_pos_var[i].end(); ++it) {
+             separate_pos_matched[i][it->first] = false;
+         }
+     }
+     for(int i = 0; i < 2; i++){
+         // i= 0 ref, =1 alt
+         for(int j = 0; j < 2; j++){
+             auto c = max_matches[i*2+j];
+             for(auto it = c.begin(); it !=c.end(); ++it){
+                 if (it->second > 0) {
+                     separate_pos_matched[i][it->first] = true;
+                 }
+             }
+         }
+     }
+     for (auto it = separate_pos_matched[0].begin(); it != separate_pos_matched[0].end(); ++it) {
+         if (it->second) {
+             complex_ref_match_num[thread_index] ++;
+         }
+     }
+     for (auto it = separate_pos_matched[1].begin(); it != separate_pos_matched[1].end(); ++it) {
+         if (it->second) {
+             complex_que_match_num[thread_index] ++;
+         }
+     }
+     for (int i = 0; i < 2; i++) {
+         auto final_iter = separate_pos_matched[i].end();
+         --final_iter;
+         for (auto it = separate_pos_matched[i].begin(); it != separate_pos_matched[i].end(); ++it) {
+             if (it->second) {
+                 int pos = it->first;
+                 DiploidVariant variant = separate_pos_var[i][pos];
+                 // second allele: alts[1] for multi-alt records, the reference
+                 // for other heterozygous ones, alts[0] again when homozygous
+                 // (same rule as VariantMatch; the test used to be inverted)
+                 string alt1_string = variant.alts[0];
+                 if (variant.multi_alts) {
+                     alt1_string = variant.alts[1];
+                 }
+                 else if (variant.heterozygous) {
+                     alt1_string = variant.ref;
+                 }
+                 string variant_record = to_string(pos) + "," + variant.ref + "," + variant.alts[0];
+                 if (multiple_match)
+                     variant_record += "/" + alt1_string;
+                 vcf_record[i] += variant_record;
+                 //cout << pos << ":" << max_matches[i*2+1][pos] << endl;
+                 phase_record[i * 2] += to_string(max_matches[i * 2][pos]);
+                 phase_record[i * 2 + 1] += to_string(max_matches[i * 2 + 1][pos]);
+                 if (it != final_iter) {
+                     vcf_record[i] += ";";
+                     phase_record[i * 2] += ",";
+                     phase_record[i * 2 + 1] += ",";
+                 }
+             }
+         }
+     }
+     match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+     if (multiple_match) {
+         match_record += "\t" + phase_record[0] + "/" + phase_record[1] + "\t" + phase_record[2] + "/" + phase_record[3];
+     }
+     else {
+         match_record += "\t.\t.";
+     }
+     match_record += "\t" + to_string(max_score) + "\n";
+     cout << match_record ;
+
+     // dump every variant of the cluster, reference side first
+     for (int i = 0; i < 2; i++)
+     {
+         if (i == 0) {
+             cout << "ref: ";
+         }
+         else {
+             cout << "alt: ";
+         }
+         cout << separate_pos_var[i].size() << endl;
+         for (auto it = separate_pos_var[i].begin(); it != separate_pos_var[i].end(); ++it) {
+             auto v = it->second;
+             cout << v.pos << "," << v.ref << "," << v.alts[0];
+             if (v.multi_alts) {
+                 cout << v.alts[1];
+             }
+             cout << ";";
+         }
+         cout << endl;
+     }
+     cout << endl;
+
+     complex_match_records[thread_index]->push_back(match_record);
+     return true;
+ }
+
+ // Backtracking search over the allele choices of variant_list[index..].
+ // On entry the current (partial) `choices` are checked with CheckPrefix:
+ //   < 0  -> the branch is abandoned,
+ //   > 0  -> the value is a score; if it beats max_score, the choices and the
+ //           two paths returned by CheckPrefix are stored in max_matches /
+ //           max_paths,
+ //   == 0 -> only a prefix matches, the search simply goes on.
+ // Then every choice for the variant at `index` is tried in turn, recursing
+ // on index + 1, and the entry is reset to -1 (undecided) afterwards.
+ // choices / max_matches hold four maps (position -> choice), indexed by
+ // flag * 2 + haplotype; max_score and max_paths are output parameters.
+ void DiploidVCF::RecurrentMatchWithIndel(vector<DiploidVariant> & variant_list,
+ const string subsequence,
+ const int offset,
+ int index,
+ map<int, DiploidVariant> separate_pos_var [],
+ map<int, int> choices [], // 4 vectors
+ map<int, int> max_matches[], // 4 vectors
+ int & max_score,
+ string max_paths[]) {
+
+ string cur_paths[2];
+ int prefix_match = CheckPrefix(subsequence, offset, separate_pos_var, choices, cur_paths);
+ if (prefix_match < 0) return;
+ // if prefix_match == 0, just prefix match
+ if (prefix_match > 0) { // sequence direct match
+ int score = prefix_match;
+ if (max_score < score) {
+ //cout << "higher score: " << score << endl;
+ max_score = score;
+ for (int i = 0; i < 4; i++) {
+ max_matches[i] = choices[i];
+ }
+ for (int i = 0; i < 2; i++) {
+ max_paths[i] = cur_paths[i];
+ }
+ }
+ }
+ // every variant has been decided: nothing left to branch on
+ if (index >= variant_list.size()) return;
+ auto variant = variant_list[index];
+ int flag = variant.flag;
+ int pos = variant.pos;
+ // choice 0 = reference, 1 = alts[0], 2 = alts[1] (multi-alt variants only)
+ int choice_end = 1;
+ if (variant.multi_alts) choice_end = 2;
+ for (int choice = 0; choice <= choice_end; choice++) {
+ if(pos == 0){
+ dout << "error pos = 0 " << endl;
+ }
+ // the first haplotype takes `choice`; the second one is derived from it
+ choices[flag * 2][pos] = choice;
+ if (choice == 0) {
+ choices[flag * 2 + 1][pos] = 0;
+ }
+ else if (choice == 1) { // include
+ if (variant.multi_alts) { // if multi_alts, then the other alleles should be included
+ choices[flag * 2 + 1][pos] = 2;
+ }
+ else if (variant.heterozygous) { // if heterozygous but not multi_alts, then reference should be included
+ choices[flag * 2 + 1][pos] = 0;
+ }
+ else { // homozygous one
+ choices[flag * 2 + 1][pos] = 1;
+ }
+ }
+ else {
+ choices[flag * 2 + 1][pos] = 1;
+ }
+
+ RecurrentMatchWithIndel(variant_list,
+ subsequence,
+ offset,
+ index + 1,
+ separate_pos_var,
+ choices,
+ max_matches,
+ max_score,
+ max_paths);
+
+ // undo the decision before trying the next choice
+ choices[flag * 2][pos] = -1;
+ choices[flag * 2 + 1][pos] = -1;
+ }
+ }
+
+// check if prefix match or equal
+ // Builds the four haplotype sequences implied by `choices` (two for the
+ // reference set, two for the query set) and compares reference against
+ // query in both possible pairings.
+ // Returns:
+ //   the score, when both pairs of some pairing are equal as whole strings
+ //     (cur_paths then receives the two reference-side sequences). The score
+ //     adds, per variant with a positive choice on either haplotype, 1 or -
+ //     with scoring_basepair - the length of its reference allele, so it
+ //     can be 0;
+ //   0, when some pairing agrees on the common prefix only;
+ //   -1, otherwise.
+ // A path stops being extended at the first undecided (-1) position.
+ int DiploidVCF::CheckPrefix(const string subsequence,
+ const int offset,
+ map<int, DiploidVariant> separate_pos_var[],
+ map<int, int> choices[],
+ string cur_paths[])
+ {
+ string paths[4] = { "" }; // 0 and 1 are ref, 2 and 3 are query path
+ // create 4 paths
+ for (int i = 0; i < 2; i++) {
+ // create
+ for (int j = 0; j < 2; j++) {
+ int index = i*2 + j;
+ map<int, int> pos_choice = choices[index];
+ string path = "";
+ // start_pos: first base of subsequence not yet copied into path
+ int start_pos = 0;
+ auto it = pos_choice.begin();
+ for (; it != pos_choice.end(); ++it) {
+ int pos = it->first;
+ int choice = it->second;
+ auto variant = separate_pos_var[i][pos];
+ string ref = variant.ref;
+ auto alts = variant.alts;
+ int offset_pos = pos - offset;
+ if (offset_pos < start_pos) {
+ //return -1;
+ }
+ else if (offset_pos > start_pos) {
+ // copy the unchanged genome bases in front of this variant
+ path += subsequence.substr(start_pos, offset_pos - start_pos);
+ }
+ if(choice < 0)
+ break;
+ if (choice == 0) {
+ path += ref;
+ }
+ else if (choice == 1) {
+ path += alts[0];
+ }
+ else {
+ path += alts[1];
+ }
+ start_pos = max(start_pos, offset_pos + (int)ref.length());
+ }
+ // all positions decided: append the remaining genome bases
+ if(it == pos_choice.end()){
+ if(start_pos < subsequence.length()){
+ path += subsequence.substr(start_pos, subsequence.length()-start_pos);
+ }
+ }
+ paths[index] = path;
+ }
+ }
+
+ // check prefix match
+ // 1-based path numbers: pairing {1-3, 2-4} and pairing {1-4, 2-3}
+ int const comb[2][4] = {
+ {1,3,2,4},
+ {1,4,2,3}
+ };
+
+ bool prefix_match = false;
+ bool direct_match = false;
+ for (int i = 0; i < 2; i++) {
+ bool check_prefix_match[2] = { false };
+ bool check_direct_match[2] = { false };
+ for (int k = 0; k < 2; k++) {
+ string s1 = paths[comb[i][k * 2]-1];
+ string s2 = paths[comb[i][k * 2 + 1]-1];
+ int min_len = min(s1.length(), s2.length());
+ string s1_sub = s1.substr(0, min_len);
+ string s2_sub = s2.substr(0, min_len);
+ check_prefix_match[k] = CompareSequence(s1_sub, s2_sub);
+ check_direct_match[k] = CompareSequence(s1, s2);
+ }
+ // a pairing only counts when both of its pairs agree
+ if (check_prefix_match[0] && check_prefix_match[1])
+ prefix_match = true;
+ if (check_direct_match[0] && check_direct_match[1])
+ direct_match = true;
+ }
+ if (direct_match) {
+ for(int i = 0; i < 4; i++){
+ dout << paths[i] << endl;
+ }
+ dout << endl;
+
+ int score = 0;
+ for (int i = 0; i < 2; i++) {
+ cur_paths[i] = paths[i];
+ auto pos_var = separate_pos_var[i];
+ for (auto it = pos_var.begin(); it != pos_var.end(); ++it) {
+ // skip variants left at reference / undecided on both haplotypes
+ if(choices[i*2][it->first] <= 0 && choices[i*2+1][it->first] <= 0){
+ continue;
+ }
+ if (scoring_basepair) {
+ score += it->second.ref.length();
+ }
+ else {
+ score += 1;
+ }
+ }
+ }
+ return score;
+ }
+ if (prefix_match) return 0;
+ return -1;
+ }
+
+// code reviewed by Channing 4/3/2016
+ // Enumerates every way of picking k of the given positions, each picked
+ // position paired with allele index 0 and - where multi_indicators says so -
+ // also with allele index 1. Each solution is a list of {position, allele}
+ // pairs. The result is empty when k is 0 or exceeds the number of positions.
+ vector<vector<vector<int>>> DiploidVCF::Combine(vector<int> & positions, vector<bool> & multi_indicators, int k) {
+     vector<vector<vector<int>>> all_sol;
+     if (!(k == 0 || k > positions.size())) {
+         vector<vector<int>> partial;
+         FindComb(positions, multi_indicators, 0, k, partial, all_sol);
+     }
+     return all_sol;
+ }
+
+// code review by Channing 4/3/2016
+// [TODO] unit test
+ // Recursive helper of Combine: extends the partial solution `sol` with k
+ // more {position, allele} pairs taken from positions[start..] and appends
+ // every completed solution to all_sol. Allele 0 is always tried, allele 1
+ // only for positions flagged in multi_indicators.
+ void DiploidVCF::FindComb(vector<int> & positions,
+                           vector<bool> & multi_indicators,
+                           int start,
+                           int k,
+                           vector<vector<int> > & sol,
+                           vector<vector<vector<int>>> & all_sol)
+ {
+     if (k == 0) {
+         all_sol.push_back(sol);
+         return;
+     }
+     // last index that still leaves k - 1 positions to its right
+     int n = positions.size();
+     const int last_start = n - k;
+     for (int i = start; i <= last_start; i++) {
+         const int allele_count = multi_indicators[i] ? 2 : 1;
+         for (int allele = 0; allele < allele_count; allele++) {
+             sol.push_back(vector<int>({ positions[i], allele }));
+             FindComb(positions, multi_indicators, i + 1, k - 1, sol, all_sol);
+             sol.pop_back();
+         }
+     }
+ }
+
+// code reviewed by Chen on 4/4/2016
+ // Matches one cluster of variants (reference and query mixed, told apart by
+ // their flag). The variants are sorted and split by flag, the genome window
+ // covering them (one extra base on each side, clamped to the genome) is cut
+ // out, and FindBestDiploidMatch looks for the best-scoring selection.
+ // On success one tab-separated record is appended to
+ // complex_match_records[thread_index]:
+ //   chrom, 1-based position, window sequence, matched sequence(s),
+ //   matched reference variants, matched query variants,
+ //   two phasing columns ("." when not reported), score
+ // and the per-thread counters of matched reference / query variants grow by
+ // the number of selected variants.
+ // Returns false for clusters of at most one variant, clusters missing one
+ // of the two sides, or when the best score is 0.
+ bool DiploidVCF::VariantMatch(vector<DiploidVariant> & variant_list, int thread_index) {
+ if(variant_list.size() <= 1) return false;
+ sort(variant_list.begin(), variant_list.end());
+ map<int, DiploidVariant> separate_pos_var[2];
+
+ // separate into ref and que
+ int min_pos = genome_sequence.length() + 1;
+ int max_pos = -1;
+ for (int i = 0; i < variant_list.size(); i++) {
+ int flag = variant_list[i].flag; // flag indicate if the variant is from ref set(0) or query set(1)
+ int pos = variant_list[i].pos;
+ separate_pos_var[flag][pos] = variant_list[i];
+ auto ref_sequence = variant_list[i].ref;
+ auto alt_sequences = variant_list[i].alts;
+
+ min_pos = min(pos, min_pos);
+ max_pos = max((int)(pos + ref_sequence.length()), max_pos);
+ //dout << pos << "," << ref_sequence << "," << alt_sequences[0] << "," << flag << endl;
+ }
+
+ // widen the window by one base on each side, clamped to the genome
+ min_pos = max(min_pos - 1, 0);
+ max_pos = min(max_pos + 1, (int)genome_sequence.length());
+
+ if (separate_pos_var[0].size() == 0 || separate_pos_var[1].size() == 0) {
+ return false;
+ }
+
+ string subsequence = genome_sequence.substr(min_pos, max_pos - min_pos);
+ int offset = min_pos;
+ vector<vector<int>> max_choices[4]; // -1 for ref, 0 for alts[0], 1 for alts[1] (only applied to multi_alts)
+ string max_paths[2];
+ int max_score = 0;
+ bool max_heterozygosity = false;
+ FindBestDiploidMatch(variant_list,
+ subsequence,
+ offset,
+ 0,
+ separate_pos_var,
+ max_choices,
+ max_score,
+ max_heterozygosity,
+ max_paths);
+
+ if (max_score == 0) {
+ return false;
+ }
+
+ // matched, print out matches
+ // the second haplotype is only reported for heterozygous matches, and only
+ // while genotype matching is switched on
+ bool multiple_match = max_heterozygosity;
+ if(! match_genotype) multiple_match = false;
+
+ vector<string> alt_list;
+ alt_list.push_back(max_paths[0]);
+ if(multiple_match)
+ alt_list.push_back(max_paths[1]);
+ DiploidVariant dv(offset, subsequence, alt_list, true, multiple_match);
+ //NormalizeDiploidVariant(dv);
+
+ string alt_record = dv.alts[0];
+ if (multiple_match)
+ alt_record += "/" + dv.alts[1];
+ string match_record = chromosome_name + "\t" + to_string(dv.pos+1) + "\t" + dv.ref + "\t" + alt_record;
+ string vcf_record[2] = { "" };
+ string phase_record[4] = { "" };
+
+ // max_choices[0] / max_choices[2] list the selected reference / query variants
+ complex_ref_match_num[thread_index] += max_choices[0].size();
+ complex_que_match_num[thread_index] += max_choices[2].size();
+
+ for (int i = 0; i < 2; i++) {
+ auto final_iter = max_choices[i*2].size()-1;
+ for (int k = 0; k < max_choices[i*2].size(); k++) {
+ // each entry is {position, allele choice}
+ int pos = max_choices[i*2][k][0];
+ DiploidVariant variant = separate_pos_var[i][pos];
+ // second allele: alts[1] for multi-alt records, the reference for other
+ // heterozygous ones, alts[0] again when homozygous
+ string alt1_string = variant.alts[0];
+ if (variant.multi_alts) {
+ alt1_string = variant.alts[1];
+ }
+ else if (variant.heterozygous) {
+ alt1_string = variant.ref;
+ }
+ string variant_record = to_string(pos+1) + "," + variant.ref + "," + variant.alts[0];
+ if (multiple_match)
+ variant_record += "/" + alt1_string;
+ vcf_record[i] += variant_record;
+ //cout << pos << ":" << max_matches[i*2+1][pos] << endl;
+ if(multiple_match){
+ // stored choices start at -1 (reference); written shifted by +1
+ phase_record[i * 2] += to_string(max_choices[i*2][k][1]+1);
+ phase_record[i * 2 + 1] += to_string(max_choices[i * 2 + 1][k][1]+1);
+ }
+ // separators go between entries only, not after the last one
+ if (k != final_iter) {
+ vcf_record[i] += ";";
+ if(multiple_match){
+ phase_record[i * 2] += ",";
+ phase_record[i * 2 + 1] += ",";
+ }
+ }
+ }
+ }
+ match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+ if (multiple_match) {
+ match_record += "\t" + phase_record[0] + "/" + phase_record[1] + "\t" + phase_record[2] + "/" + phase_record[3];
+ }
+ else {
+ match_record += "\t.\t.";
+ }
+ match_record += "\t" + to_string(max_score) + "\n";
+
+ complex_match_records[thread_index]->push_back(match_record);
+ return true;
+ }
+
+ // Debug helper: dumps one VariantSelection to stdout - its two genome
+ // positions, the position:phasing pairs of both variant sets (one line
+ // each) and its four donor sequences.
+ void PrintSelection(VariantSelection selection){
+     cout << "$ Selection: $" << endl;
+     cout << "\t genome position:" << selection.genome_position[0] << "," << selection.genome_position[1] << endl;
+     for (int side = 0; side < 2; side++) {
+         auto & positions = selection.pos_vectors[side];
+         auto & phasings = selection.phasing_vectors[side];
+         for (int k = 0; k < positions.size(); k++) {
+             cout << "\t" << positions[k] << ":" << phasings[k] << "," ;
+         }
+         cout << endl;
+     }
+     for (int donor = 0; donor < 4; donor++) {
+         cout << selection.donor_sequences[donor] << "," ;
+     }
+     cout << endl;
+ }
+
+ // Debug helper: prints flag, position, reference allele and alternative
+ // allele(s) of one variant to stdout; a second alternative is appended
+ // after a '/' for multi-alt variants.
+ void DiploidVCF::PrintVariant(DiploidVariant var){
+     string alleles = var.alts[0];
+     if (var.multi_alts) {
+         alleles += "/" + var.alts[1];
+     }
+     cout << "-Variant:-" << endl;
+     cout << var.flag << "," << var.pos << "," << var.ref << "," << alleles << endl;
+ }
+
+ // Debug helper: prints a banner, the number of selections and then every
+ // selection of the list through PrintSelection.
+ void PrintSelectionsList(list<VariantSelection> variant_selections){
+     cout << "==========Selections List==================" <<endl;
+     cout << variant_selections.size() << endl;
+     for (VariantSelection selection : variant_selections) {
+         PrintSelection(selection);
+     }
+ }
+
+ // code review by Chen on 04/15/2016 and unit test
+ // if time consuming, change to the same algorithm as RTG
+ //
+ // Rebuilds the four donor sequences from scratch for the phasing decisions
+ // recorded in `selection` and classifies how well the two variant lists agree.
+ //
+ // Parameters:
+ //   separate_var_list : two variant vectors, indexed 0/1 in the same way as
+ //                       selection.phasing_vectors
+ //   selection         : decisions taken so far; on the paths that reach the
+ //                       comparison step this function writes min_genome_pos,
+ //                       genome_position[0..1], donor_length[0..1] and
+ //                       haplotypes_consistent
+ //   subsequence       : reference window the variants are applied to
+ //   offset            : subtracted from variant.pos to index into subsequence
+ //   donor_sequences   : output array, entries [0],[1] are built from list 0 and
+ //                       entries [2],[3] from list 1 (must hold at least 4 strings)
+ //
+ // Return value:
+ //   -1 : reject (a fully decided list scored 0, a variant falls outside the
+ //        window, or the donors neither match nor prefix-match)
+ //    1 : donors prefix-match only, or donors match but the two lists stop at
+ //        different genome positions
+ //    2 : donors match at a common genome position, undecided variants remain
+ //    3 : donors match at a common genome position and every variant is decided
+ int DiploidVCF::CheckDonorSequences(vector<DiploidVariant> separate_var_list[],
+ VariantSelection & selection,
+ const string & subsequence,
+ int offset,
+ string donor_sequences[]){
+ // if score == 0, do not bother to collapse
+ //if(selection.score == 0) return -1;
+
+ // so here the new donor checking algorithm does not make sense
+
+ // haplotype indicates the haplotype used in D_0
+ // the other haplotype need to calculate
+ // haplotype == -1, all add ref
+ // haplotype == 0, D_0 add alts[0], D_1 add alts[1] if multi_alts, add ref if heterozygous, add alts[0] otherwise
+ // haplotype == 1, D_0 add alts[1] if multi_alts, add ref otherwise, D_1 add alts[0]
+
+ // first, decide substr of genome sequence that be applied
+ // genome sequence that is
+ int genome_position[2] = {-1, -1};
+ int cut_length[2] = {-1, -1};
+ int pos_lower_bound[2] = {-1, -1}; // exclusive
+ int pos_upper_bound[2] = {-1, -1}; // exclusive
+
+ int variant_num[2];
+ // For each list: lower bound = end of the last decided variant, upper bound =
+ // start of the next undecided variant (both relative to `offset`).
+ for(int i = 0; i < 2; i++){
+ variant_num[i] = (int)selection.phasing_vectors[i].size();
+
+ if(variant_num[i] == 0){
+ pos_lower_bound[i] = -1;
+ }else{
+ DiploidVariant lower_variant = separate_var_list[i][variant_num[i]-1];
+ pos_lower_bound[i] = (lower_variant.pos - offset) + lower_variant.ref.length();
+ }
+
+ if(variant_num[i] < separate_var_list[i].size()){
+ pos_upper_bound[i] = separate_var_list[i][variant_num[i]].pos - offset;
+ }else{
+ // every variant of this list has been decided; a list that applied none of
+ // them (separate_score == 0) is rejected
+ if(selection.separate_score[i] == 0){
+ return -1;
+ }
+ pos_upper_bound[i] = (int)subsequence.length();
+ }
+ }
+
+ // If the smaller upper bound is not below the larger lower bound, both lists
+ // are cut at that same position; otherwise each list keeps its own upper bound.
+ if(min(pos_upper_bound[0], pos_upper_bound[1]) - max(pos_lower_bound[0], pos_lower_bound[1]) >= 0){
+ genome_position[0] = min(pos_upper_bound[0], pos_upper_bound[1]);
+ genome_position[1] = genome_position[0];
+ }else{
+ genome_position[0] = pos_upper_bound[0];
+ genome_position[1] = pos_upper_bound[1];
+ }
+
+ // number of trailing characters to drop from each donor after substitution
+ cut_length[0] = subsequence.length() - genome_position[0];
+ cut_length[1] = subsequence.length() - genome_position[1];
+
+ // here first decide reference sequence for apply
+ for(int i = 0; i < 2; i++){
+ donor_sequences[i*2] = subsequence;
+ donor_sequences[i*2+1] = subsequence;
+ }
+
+ // Apply the chosen variants from last to first so that earlier (smaller)
+ // coordinates are not shifted by substitutions made further right.
+ for(int i = 0; i < 2; i++){
+ for(int k = (int)selection.phasing_vectors[i].size() - 1; k >= 0; k--){
+ int temp_phasing = selection.phasing_vectors[i][k];
+ // phasing -1 means the variant was skipped: leave the reference untouched
+ if(temp_phasing == -1){
+ continue;
+ }
+ DiploidVariant temp_var = separate_var_list[i][k];
+ int temp_pos = temp_var.pos;
+ int temp_end = temp_pos + temp_var.ref.length();
+ int relative_end = temp_end - offset;
+ int relative_start = temp_pos - offset;
+ // reject when the variant's reference span does not fit inside either donor
+ if(relative_start < 0 || relative_end > donor_sequences[i*2].length() || relative_end > donor_sequences[i*2+1].length()){
+ //dout << "overlapping variants" << endl;
+ return -1;
+ }
+
+ // one_alt goes into donor i*2, other_alt into donor i*2+1
+ string one_alt = "";
+ string other_alt = "";
+ string var_ref = temp_var.ref;
+ if(temp_phasing == 0){
+ one_alt = temp_var.alts[0];
+ if(temp_var.multi_alts){
+ other_alt = temp_var.alts[1];
+ }else if(temp_var.heterozygous){
+ other_alt = var_ref;
+ }else{
+ other_alt = one_alt;
+ }
+ }else{
+ if(temp_var.multi_alts){
+ one_alt = temp_var.alts[1];
+ }else{
+ one_alt = var_ref;
+ }
+ other_alt = temp_var.alts[0];
+ }
+ // splice the allele in place of the reference span in both donors of list i
+ string t_sequence = donor_sequences[i*2];
+ string pre_string = t_sequence.substr(0, relative_start);
+ string post_string = t_sequence.substr(relative_end, t_sequence.length() - relative_end);
+ donor_sequences[i*2] = pre_string + one_alt + post_string;
+ t_sequence = donor_sequences[i*2+1];
+ pre_string = t_sequence.substr(0, relative_start);
+ post_string = t_sequence.substr(relative_end, t_sequence.length() - relative_end);
+ donor_sequences[i*2+1] = pre_string + other_alt + post_string;
+ // cout << ":::::::" << endl;
+ // cout << subsequence << ", " << offset << endl;
+ // PrintVariant(temp_var);
+ // cout << relative_start << "," << relative_end << endl;
+ // cout << donor_sequences[i*2] << endl;
+ // cout << donor_sequences[i*2+1] << endl;
+ }
+ // cout << pos_lower_bound[i] << "," << pos_upper_bound[i] << "," ;
+ // cout << genome_position[i] << "," << cut_length[i] << endl;
+ }
+
+
+ // Trim the untouched reference tail beyond genome_position[i]; when
+ // genome_position[i] <= 0 (cut_length >= window length) the donors become empty.
+ for(int i = 0; i < 2; i++){
+ // cout << "&&&&&" << genome_position[i] << "," << cut_length[i] << endl;
+ if(cut_length[i] < (int)subsequence.length()){
+ donor_sequences[i*2] = donor_sequences[i*2].substr(0, donor_sequences[i*2].length() - cut_length[i]);
+ donor_sequences[i*2+1] = donor_sequences[i*2+1].substr(0, donor_sequences[i*2+1].length() - cut_length[i]);
+ }else{
+ donor_sequences[i*2] = "";
+ donor_sequences[i*2+1] = "";
+ }
+ if(genome_position[i] < 0) genome_position[i] = -1;
+ }
+ selection.min_genome_pos = min(genome_position[0], genome_position[1]);
+ // cout << "after apply Selection:" << endl;
+ // cout << donor_sequences[0] << endl;
+ // cout << donor_sequences[1] << endl;
+ // cout << donor_sequences[2] << endl;
+ // cout << donor_sequences[3] << endl;
+ // The two lists agree when their donor pairs are equal in either order.
+ bool donor_match = false;
+ if(donor_sequences[0] == donor_sequences[2] && donor_sequences[1] == donor_sequences[3]){
+ donor_match = true;
+ selection.haplotypes_consistent = true;
+ }else if(donor_sequences[0] == donor_sequences[3] && donor_sequences[1] == donor_sequences[2]){
+ donor_match = true;
+ selection.haplotypes_consistent = true;
+ }
+
+ // NOTE(review): only donor_length[0] and [1] are written here (both donors of
+ // list 0), while other code in this file reads donor_length[2] and [3] --
+ // confirm whether this loop was meant to cover all four donors.
+ for(int i = 0; i < 2; i++){
+ selection.genome_position[i] = genome_position[i];
+ selection.donor_length[i] = donor_sequences[i].length();
+ }
+
+ if(! donor_match){
+ // nothing left to add: a mismatch can no longer be repaired
+ if(variant_num[0] == separate_var_list[0].size() && variant_num[1] == separate_var_list[1].size()) return -1;
+ selection.haplotypes_consistent = false;
+ // PrefixMatch is defined elsewhere; it is applied to the donor pairs in both orders
+ bool prefix_match = false;
+ if(PrefixMatch(donor_sequences[0], donor_sequences[2]) && PrefixMatch(donor_sequences[1], donor_sequences[3])){
+ prefix_match = true;
+ }else if(PrefixMatch(donor_sequences[0], donor_sequences[3]) && PrefixMatch(donor_sequences[1], donor_sequences[2])){
+ prefix_match = true;
+ }
+ if(prefix_match){
+ return 1;
+ }else{
+ return -1;
+ }
+ }
+
+ // donors match but the lists were cut at different positions: report as state 1
+ if(genome_position[0]!=genome_position[1]) return 1;
+
+ if(variant_num[0] == separate_var_list[0].size() && variant_num[1] == separate_var_list[1].size()){
+ // achieve whole genome
+ return 3;
+ }
+ // cut only when not reach the end
+ // set min_donor_length
+ // set need_variant = true, because you did not use up all variants
+ return 2;
+ }
+
+ // code review by Chen on 04/15/2016 and unit test
+ // if time consuming, change to the same algorithm as RTG
+ //
+ // Variant of CheckDonorSequences intended for clusters with overlapping
+ // variants: it rebuilds the four donor sequences from scratch, refuses to apply
+ // two variants with the same position and reference allele within one list,
+ // and derives the cut positions only after all substitutions were made.
+ //
+ // Parameters:
+ //   separate_var_list : two variant vectors, indexed 0/1 in the same way as
+ //                       selection.phasing_vectors
+ //   selection         : decisions taken so far; min_genome_pos,
+ //                       genome_position[0..1], donor_length[0..1] and
+ //                       haplotypes_consistent are written on the paths that
+ //                       reach the comparison step
+ //   subsequence       : reference window the variants are applied to
+ //   offset            : subtracted from variant.pos to index into subsequence
+ //   donor_sequences   : output array, [0],[1] built from list 0, [2],[3] from
+ //                       list 1 (must hold at least 4 strings)
+ //
+ // Phasing values stored in selection.phasing_vectors:
+ //   -1 : variant skipped, reference kept
+ //    0 : first donor gets alts[0]; second donor gets alts[1] if multi_alts,
+ //        ref if heterozygous, alts[0] otherwise
+ //    other : first donor gets alts[1] if multi_alts, ref otherwise;
+ //        second donor gets alts[0]
+ //
+ // Return value:
+ //   -1 : reject
+ //    1 : donors prefix-match only, or match at different genome positions
+ //    2 : donors match at a common position, undecided variants remain
+ //    3 : donors match at a common position and every variant is decided
+ int DiploidVCF::CheckDonorSequencesWithOverlap(vector<DiploidVariant> separate_var_list[],
+                                                VariantSelection & selection,
+                                                const string & subsequence,
+                                                int offset,
+                                                string donor_sequences[]){
+     int genome_position[2] = {-1, -1};
+     int cut_length[2] = {-1, -1};
+     int pos_lower_bound[2] = {-1, -1}; // exclusive
+     int pos_upper_bound[2] = {-1, -1}; // exclusive
+
+     int variant_num[2];
+     for(int i = 0; i < 2; i++){
+         variant_num[i] = (int)selection.phasing_vectors[i].size();
+
+         // initial lower bound: end of the last decided variant (relative to offset)
+         if(variant_num[i] == 0){
+             pos_lower_bound[i] = -1;
+         }else{
+             DiploidVariant lower_variant = separate_var_list[i][variant_num[i]-1];
+             pos_lower_bound[i] = (lower_variant.pos - offset) + lower_variant.ref.length();
+         }
+
+         // upper bound: start of the next undecided variant, or end of the window
+         if(variant_num[i] < separate_var_list[i].size()){
+             pos_upper_bound[i] = separate_var_list[i][variant_num[i]].pos - offset;
+         }else{
+             // list fully decided but nothing applied: reject
+             if(selection.separate_score[i] == 0){
+                 return -1;
+             }
+             pos_upper_bound[i] = (int)subsequence.length();
+         }
+     }
+
+     // every donor starts as a copy of the reference window
+     for(int i = 0; i < 2; i++){
+         donor_sequences[i*2] = subsequence;
+         donor_sequences[i*2+1] = subsequence;
+     }
+
+     for(int i = 0; i < 2; i++){
+         // last variant actually applied in this list; only meaningful once
+         // has_pre_var is true, so a default-constructed value is never compared
+         DiploidVariant pre_var;
+         bool has_pre_var = false;
+
+         // apply from last to first so left-hand coordinates stay valid
+         for(int k = (int)selection.phasing_vectors[i].size() - 1; k >= 0; k--){
+             int temp_phasing = selection.phasing_vectors[i][k];
+             if(temp_phasing == -1){
+                 continue;
+             }
+             DiploidVariant temp_var = separate_var_list[i][k];
+
+             // can not change the same sequence twice.
+             // FIX: the original used '=' here, which overwrote temp_var.pos
+             // with a 0/1 value instead of comparing the two positions.
+             if(has_pre_var && temp_var.pos == pre_var.pos && temp_var.ref == pre_var.ref){
+                 return -1;
+             }
+
+             int temp_pos = temp_var.pos;
+             int temp_end = temp_pos + temp_var.ref.length();
+
+             // NOTE(review): temp_end is an absolute coordinate while
+             // pos_lower_bound is relative to offset -- kept as in the original,
+             // confirm whether (temp_end - offset) was intended.
+             pos_lower_bound[i] = max(pos_lower_bound[i], temp_end);
+
+             int relative_end = temp_end - offset;
+             int relative_start = temp_pos - offset;
+             // reject when the reference span does not fit inside either donor
+             if(relative_start < 0 || relative_end > donor_sequences[i*2].length() || relative_end > donor_sequences[i*2+1].length()){
+                 return -1;
+             }
+
+             // one_alt goes into donor i*2, other_alt into donor i*2+1
+             string one_alt = "";
+             string other_alt = "";
+             string var_ref = temp_var.ref;
+             if(temp_phasing == 0){
+                 one_alt = temp_var.alts[0];
+                 if(temp_var.multi_alts){
+                     other_alt = temp_var.alts[1];
+                 }else if(temp_var.heterozygous){
+                     other_alt = var_ref;
+                 }else{
+                     other_alt = one_alt;
+                 }
+             }else{
+                 if(temp_var.multi_alts){
+                     one_alt = temp_var.alts[1];
+                 }else{
+                     one_alt = var_ref;
+                 }
+                 other_alt = temp_var.alts[0];
+             }
+
+             // splice the alleles in place of the reference span
+             string t_sequence = donor_sequences[i*2];
+             string pre_string = t_sequence.substr(0, relative_start);
+             string post_string = t_sequence.substr(relative_end, t_sequence.length() - relative_end);
+             donor_sequences[i*2] = pre_string + one_alt + post_string;
+             t_sequence = donor_sequences[i*2+1];
+             pre_string = t_sequence.substr(0, relative_start);
+             post_string = t_sequence.substr(relative_end, t_sequence.length() - relative_end);
+             donor_sequences[i*2+1] = pre_string + other_alt + post_string;
+
+             pre_var = temp_var;
+             has_pre_var = true;
+         }
+     }
+
+     // common cut position when the bounds allow it, otherwise one per list
+     if(min(pos_upper_bound[0], pos_upper_bound[1]) - max(pos_lower_bound[0], pos_lower_bound[1]) >= 0){
+         genome_position[0] = min(pos_upper_bound[0], pos_upper_bound[1]);
+         genome_position[1] = genome_position[0];
+     }else{
+         genome_position[0] = pos_upper_bound[0];
+         genome_position[1] = pos_upper_bound[1];
+     }
+
+     cut_length[0] = subsequence.length() - genome_position[0];
+     cut_length[1] = subsequence.length() - genome_position[1];
+
+     // drop the untouched reference tail beyond genome_position[i]
+     for(int i = 0; i < 2; i++){
+         if(cut_length[i] < (int)subsequence.length()){
+             donor_sequences[i*2] = donor_sequences[i*2].substr(0, donor_sequences[i*2].length() - cut_length[i]);
+             donor_sequences[i*2+1] = donor_sequences[i*2+1].substr(0, donor_sequences[i*2+1].length() - cut_length[i]);
+         }else{
+             donor_sequences[i*2] = "";
+             donor_sequences[i*2+1] = "";
+         }
+         if(genome_position[i] < 0) genome_position[i] = -1;
+     }
+     selection.min_genome_pos = min(genome_position[0], genome_position[1]);
+
+     // the lists agree when their donor pairs are equal in either order
+     bool donor_match = false;
+     if(donor_sequences[0] == donor_sequences[2] && donor_sequences[1] == donor_sequences[3]){
+         donor_match = true;
+         selection.haplotypes_consistent = true;
+     }else if(donor_sequences[0] == donor_sequences[3] && donor_sequences[1] == donor_sequences[2]){
+         donor_match = true;
+         selection.haplotypes_consistent = true;
+     }
+
+     // NOTE(review): only donor_length[0] and [1] are written, as in the
+     // original -- confirm whether all four donors were meant to be recorded.
+     for(int i = 0; i < 2; i++){
+         selection.genome_position[i] = genome_position[i];
+         selection.donor_length[i] = donor_sequences[i].length();
+     }
+
+     if(! donor_match){
+         // nothing left to add: a mismatch can no longer be repaired
+         if(variant_num[0] == separate_var_list[0].size() && variant_num[1] == separate_var_list[1].size()) return -1;
+         selection.haplotypes_consistent = false;
+         bool prefix_match = false;
+         if(PrefixMatch(donor_sequences[0], donor_sequences[2]) && PrefixMatch(donor_sequences[1], donor_sequences[3])){
+             prefix_match = true;
+         }else if(PrefixMatch(donor_sequences[0], donor_sequences[3]) && PrefixMatch(donor_sequences[1], donor_sequences[2])){
+             prefix_match = true;
+         }
+         if(prefix_match){
+             return 1;
+         }else{
+             return -1;
+         }
+     }
+
+     // donors match but the lists were cut at different positions
+     if(genome_position[0]!=genome_position[1]) return 1;
+
+     if(variant_num[0] == separate_var_list[0].size() && variant_num[1] == separate_var_list[1].size()){
+         // achieve whole genome
+         return 3;
+     }
+     // cut only when not reach the end
+     // set min_donor_length
+     // set need_variant = true, because you did not use up all variants
+     return 2;
+ }
+
+ // Incrementally extends selection.donor_sequences after one more variant
+ // decision was appended to `selection`, instead of rebuilding the donors from
+ // the whole window, and classifies the resulting state.
+ //
+ // Parameters:
+ //   separate_var_list : two variant vectors, indexed 0/1 like selection.phasing_vectors
+ //   selection         : updated in place (donor_sequences, genome_position,
+ //                       min_genome_pos, donor_length[0..1], overlap_detected,
+ //                       haplotypes_consistent)
+ //   subsequence       : reference window the variants are applied to
+ //   offset            : subtracted from variant.pos to index into subsequence
+ //   flag              : index (0 or 1) of the list whose last variant must be
+ //                       spliced into its donors; any other value (the caller in
+ //                       this file passes -1 for a skipped variant) makes both
+ //                       lists take the "reference only" branch
+ //
+ // Return value: -1 reject, 1 prefix match only, 2 donors match with undecided
+ // variants remaining, 3 donors match and every variant is decided.
+ int DiploidVCF::ExtendingDonorSequences(vector<DiploidVariant> separate_var_list[],
+ VariantSelection & selection,
+ const string & subsequence,
+ int offset,
+ int flag){
+ int genome_position[2] = {0, 0};
+ int pos_lower_bound[2] = {0, 0}; // exclusive
+ int pos_upper_bound[2] = {0, 0}; // exclusive
+
+ int variant_num[2];
+ // becomes false as soon as one list still has an undecided variant
+ bool consider_all_variants = true;
+ // For each list: lower bound = end of the last decided variant, upper bound =
+ // start of the next undecided variant (both relative to `offset`).
+ for(int i = 0; i < 2; i++){
+ variant_num[i] = (int)selection.phasing_vectors[i].size();
+
+ if(variant_num[i] == 0){
+ pos_lower_bound[i] = 0;
+ }else{
+ DiploidVariant lower_variant = separate_var_list[i][variant_num[i]-1];
+ pos_lower_bound[i] = (lower_variant.pos - offset) + lower_variant.ref.length();
+ }
+
+ if(variant_num[i] < separate_var_list[i].size()){
+ consider_all_variants = false;
+ pos_upper_bound[i] = separate_var_list[i][variant_num[i]].pos - offset;
+ }else{
+ // list fully decided but no variant applied from it: reject
+ if(selection.separate_score[i] == 0){
+ return -1;
+ }
+ pos_upper_bound[i] = (int)subsequence.length();
+ }
+ //if(pos_upper_bound[i] < pos_lower_bound[i]) pos_upper_bound[i] = pos_lower_bound[i];
+ // dout << i << " lower bound:" << pos_lower_bound[i] << endl;
+ // dout << i << " upper bound:" << pos_upper_bound[i] << endl;
+ }
+
+ // Common target position when the bounds allow it, otherwise one per list.
+ if(min(pos_upper_bound[0], pos_upper_bound[1]) - max(pos_lower_bound[0], pos_lower_bound[1]) >= 0){
+ genome_position[0] = min(pos_upper_bound[0], pos_upper_bound[1]);
+ genome_position[1] = genome_position[0];
+ }else{
+ genome_position[0] = pos_upper_bound[0];
+ genome_position[1] = pos_upper_bound[1];
+ }
+
+ for(int i = 0; i < 2; i++){
+ // also consider overlap variants here
+ // position up to which the donors of list i were built by earlier calls
+ int pre_start = selection.genome_position[i];
+ if(i!=flag){
+ // No allele to splice for this list: only move its donors to the new target.
+ if(pre_start == genome_position[i]) continue;
+
+ if(pre_start > genome_position[i]){
+ // target moved left: drop the surplus characters from the donor ends
+ int cut_len = pre_start - genome_position[i];
+ selection.donor_sequences[i*2] = selection.donor_sequences[i*2].substr(0, selection.donor_sequences[i*2].length()-cut_len);
+ selection.donor_sequences[i*2+1] = selection.donor_sequences[i*2+1].substr(0, selection.donor_sequences[i*2+1].length()-cut_len);
+ }else{
+ // target moved right: append the reference between old and new position
+ string post_s = subsequence.substr(pre_start, genome_position[i]-pre_start);
+ selection.donor_sequences[i*2] += post_s;
+ selection.donor_sequences[i*2+1] += post_s;
+ }
+ selection.genome_position[i] = genome_position[i];
+ }else{
+ // The last decided variant of list i is the one to splice in.
+ int last_i = variant_num[i]-1;
+ DiploidVariant last_v = separate_var_list[i][last_i];
+ int last_phase = selection.phasing_vectors[i][last_i];
+ int pre_end = last_v.pos - offset;
+ int post_start = pre_end + last_v.ref.length();
+ // the variant starts before the end of the already built donor
+ if(pre_end < pre_start){
+ dout << "error when extend donor sequence" << endl;
+ return -1;
+ }
+
+ int post_end = genome_position[i];
+ // Target lies inside this variant's reference span: record the overlap and
+ // push the position of list i to the end of the variant.
+ if(post_end < post_start){
+ selection.overlap_detected = true;
+ genome_position[i] = post_start;
+ post_end = post_start;
+ }
+
+ // one_alt goes into donor i*2, other_alt into donor i*2+1; both stay equal
+ // to the reference allele when last_phase is neither 0 nor 1
+ string var_ref = last_v.ref;
+ string one_alt = var_ref;
+ string other_alt = var_ref;
+ if(last_phase == 0){
+ one_alt = last_v.alts[0];
+ if(last_v.multi_alts){
+ other_alt = last_v.alts[1];
+ }else if(!last_v.heterozygous){
+ other_alt = one_alt;
+ }
+ }else if(last_phase == 1){
+ if(last_v.multi_alts){
+ one_alt = last_v.alts[1];
+ }
+ other_alt = last_v.alts[0];
+ }
+
+ // append: reference up to the variant, the allele, reference up to the target
+ string pre_string = subsequence.substr(pre_start, pre_end-pre_start);
+ string post_string = subsequence.substr(post_start, post_end - post_start);
+ selection.donor_sequences[i*2] += pre_string + one_alt + post_string;
+ selection.donor_sequences[i*2+1] += pre_string + other_alt + post_string;
+
+ selection.genome_position[i] = genome_position[i];
+ }
+ }
+
+ bool same_genome_position = false;
+ if(genome_position[0]==genome_position[1]) same_genome_position = true;
+
+ if(same_genome_position){
+ selection.min_genome_pos = genome_position[0];
+ }else{
+ selection.min_genome_pos = min(genome_position[0], genome_position[1]);
+ }
+
+ // NOTE(review): only donor_length[0] and [1] are written (both donors of
+ // list 0), while other code in this file reads donor_length[2] and [3] --
+ // confirm whether all four donors were meant to be recorded.
+ for(int i = 0; i < 2; i++){
+ selection.donor_length[i] = selection.donor_sequences[i].length();
+ }
+
+ bool donor_match = false;
+
+ // Full equality is only tested when both lists stop at the same position;
+ // the donor pairs may be equal in either order.
+ if(same_genome_position){
+ if(selection.donor_sequences[0] == selection.donor_sequences[2] && selection.donor_sequences[1] == selection.donor_sequences[3]){
+ donor_match = true;
+ selection.haplotypes_consistent = true;
+ }
+ else if(selection.donor_sequences[0] == selection.donor_sequences[3] && selection.donor_sequences[1] == selection.donor_sequences[2]){
+ donor_match = true;
+ selection.haplotypes_consistent = true;
+ }
+ }
+
+ // matching prefix is actually not necessary, we can postpone until we get the same sequence length
+ if(! donor_match){
+ // nothing left to add: a mismatch can no longer be repaired
+ if(consider_all_variants) return -1;
+
+ selection.haplotypes_consistent = false;
+ // PrefixMatch is defined elsewhere; it is applied to the donor pairs in both orders
+ bool prefix_match = false;
+ if(PrefixMatch(selection.donor_sequences[0], selection.donor_sequences[2]) && PrefixMatch(selection.donor_sequences[1], selection.donor_sequences[3])){
+ prefix_match = true;
+ }
+ else if(PrefixMatch(selection.donor_sequences[0], selection.donor_sequences[3]) && PrefixMatch(selection.donor_sequences[1], selection.donor_sequences[2])){
+ prefix_match = true;
+ }
+ if(prefix_match){
+ // if(same_genome_position){
+ // return 4;
+ // }
+ return 1;
+ }else{
+ return -1;
+ }
+ }
+
+ // donors match and every variant of both lists has been decided
+ if(consider_all_variants){
+ return 3;
+ }
+
+ // donors match but undecided variants remain
+ return 2;
+ }
+
+ // code review by Chen on 04/15/2016
+ // [TODO] unit test
+ //
+ // Appends one decision (variant + haplotype) to a private copy of `selection`,
+ // re-evaluates its donor sequences and files the copy according to the result.
+ //
+ //   variant_selections : work list of open selections, receives the copy when
+ //                        it is worth exploring further
+ //   selection          : taken by value on purpose, the caller's object is
+ //                        left untouched
+ //   haplotype          : phasing stored for the variant; -1 means "skip it"
+ //   best_selection     : replaced when the copy is a complete match with a
+ //                        strictly higher score
+ //
+ // Donor evaluation yields one of these states:
+ //   <= 0 : no match and no prefix match, copy is dropped
+ //   1    : prefix match only, copy is inserted into the work list
+ //   2    : match but variants remain, copy is merged via CollapseSelections
+ //   3    : match and all variants decided, copy competes with best_selection
+ //
+ // Returns true only when the work list received the copy.
+ bool DiploidVCF::AddVariantToSelection(list<VariantSelection> & variant_selections,
+                                        VariantSelection selection,
+                                        DiploidVariant variant,
+                                        int haplotype,
+                                        vector<DiploidVariant> separate_var_list[],
+                                        const string & subsequence,
+                                        int offset,
+                                        VariantSelection & best_selection){
+     int flag = variant.flag;
+     const int variant_pos = variant.pos;
+
+     // record the decision in the list the variant belongs to
+     selection.pos_vectors[flag].push_back(variant_pos);
+     selection.phasing_vectors[flag].push_back(haplotype);
+
+     if(haplotype == -1){
+         // skipped variant: nothing to splice, tell the extension step so
+         flag = -1;
+     }else{
+         selection.score++;
+         selection.separate_score[flag] ++;
+     }
+
+     int consistent_state = 0;
+     if(!selection.overlap_detected){
+         // cheap path: extend the donors already stored in the selection
+         consistent_state = ExtendingDonorSequences(separate_var_list,
+                                                    selection,
+                                                    subsequence,
+                                                    offset,
+                                                    flag);
+     }else{
+         // overlapping variants: rebuild all four donors from scratch
+         string donor_sequences[4];
+         consistent_state = CheckDonorSequences(separate_var_list,
+                                                selection,
+                                                subsequence,
+                                                offset,
+                                                donor_sequences);
+         for(int d = 0; d < 4; d++){
+             selection.donor_sequences[d] = donor_sequences[d];
+         }
+     }
+
+     switch(consistent_state){
+     case 1: {
+         // keep the work list ordered: insert after all equivalent entries
+         auto insert_at = upper_bound(variant_selections.begin(), variant_selections.end(), selection);
+         variant_selections.insert(insert_at, selection);
+         return true;
+     }
+     case 2:
+         // you can only collapse one selection at a time
+         return CollapseSelections(selection, variant_selections);
+     case 3:
+         if(selection.score > best_selection.score){
+             best_selection = selection;
+         }
+         return false;
+     default:
+         // covers every state <= 0 as well as unknown states
+         return false;
+     }
+ }
+
+ // Inserts `selection` into the work list, collapsing it with an existing entry
+ // that has the same min_genome_pos, equal genome positions for both of its own
+ // lists, and four identical donor sequences.
+ // The scan stops at the first entry whose min_genome_pos is larger than the
+ // new one (insert in front of it) or at the first equivalent entry: a lower
+ // scoring equivalent is replaced, otherwise the new selection is discarded.
+ // Returns true when `selection` ended up in the list, false when it was discarded.
+ bool DiploidVCF::CollapsePrefixMatchSelection(VariantSelection selection,
+                                               list<VariantSelection> & variant_selections){
+     auto cursor = variant_selections.begin();
+     while(cursor != variant_selections.end()){
+         const VariantSelection & existing = *cursor;
+
+         // first entry that sorts after the new selection: insert before it
+         if(existing.min_genome_pos > selection.min_genome_pos){
+             break;
+         }
+
+         bool equivalent =
+             existing.min_genome_pos == selection.min_genome_pos &&
+             existing.genome_position[0] == existing.genome_position[1] && // also same genome position
+             existing.donor_sequences[0] == selection.donor_sequences[0] &&
+             existing.donor_sequences[1] == selection.donor_sequences[1] &&
+             existing.donor_sequences[2] == selection.donor_sequences[2] &&
+             existing.donor_sequences[3] == selection.donor_sequences[3];
+         if(!equivalent){
+             ++cursor;
+             continue;
+         }
+
+         // equivalent entry that is at least as good: keep it, drop the new one
+         if(!(existing.score < selection.score)){
+             return false;
+         }
+
+         // equivalent but worse: remove it and insert at the freed position
+         cursor = variant_selections.erase(cursor);
+         break;
+     }
+
+     // cursor is either the insertion point found above or end()
+     variant_selections.insert(cursor, selection);
+     return true;
+ }
+
+ // code review by Chen on 04/15/2016, unit test
+ //
+ // Merges a matching selection into the ordered work list.
+ // Candidates are the entries that compare equivalent to `selection` under the
+ // ordering used by lower_bound/upper_bound. A candidate collapses with the new
+ // selection when it is haplotype consistent, has the same two genome positions
+ // and the same pair of donor lengths (in either order). Of two collapsing
+ // selections only the higher scoring one is kept.
+ // Returns true when `selection` was inserted, false when an existing entry
+ // with an equal or higher score made it redundant.
+ bool DiploidVCF::CollapseSelections(VariantSelection selection,
+                                     list<VariantSelection> & variant_selections){
+     auto range_begin = lower_bound(variant_selections.begin(), variant_selections.end(), selection);
+     auto range_end = upper_bound(range_begin, variant_selections.end(), selection);
+
+     bool has_candidates = range_begin != variant_selections.end() &&
+                           range_begin->min_genome_pos == selection.min_genome_pos;
+
+     if(has_candidates){
+         for(auto it = range_begin; it != range_end; ++it){
+             const VariantSelection & existing = *it;
+
+             if(!existing.haplotypes_consistent) continue;
+             if(existing.genome_position[0] != selection.genome_position[0]) continue;
+             if(existing.genome_position[1] != selection.genome_position[1]) continue;
+
+             bool same_order = existing.donor_length[0] == selection.donor_length[0] &&
+                               existing.donor_length[1] == selection.donor_length[1];
+             bool swapped_order = existing.donor_length[1] == selection.donor_length[0] &&
+                                  existing.donor_length[0] == selection.donor_length[1];
+             if(!same_order && !swapped_order) continue;
+
+             // first collapsing candidate decides the outcome
+             if(existing.score < selection.score){
+                 auto freed_slot = variant_selections.erase(it);
+                 variant_selections.insert(freed_slot, selection);
+                 return true;
+             }
+             return false;
+         }
+     }
+
+     // no candidate collapsed with the new selection: insert it after its peers
+     variant_selections.insert(range_end, selection);
+     return true;
+ }
+
+// code reviewed by Chen on 04/15/2016
+// [TODO] unit test
+
+bool DiploidVCF::AcceleratedVariantMatchPathCreation(vector<DiploidVariant> & variant_list, int thread_index, int cluster_id){
+ if(variant_list.size() <= 1) return false;
+ sort(variant_list.begin(), variant_list.end()); // here we need to sort
+ vector<DiploidVariant> separate_var_list[2];
+ // separate into ref and que
+ int total_mil = 0;
+ int total_mdl = 0;
+ int min_pos = genome_sequence.length() + 1;
+ int max_pos = -1;
+ for (int i = 0; i < variant_list.size(); i++) {
+ int flag = variant_list[i].flag; // flag indicate if the variant is from ref set(0) or query set(1)
+ int pos = variant_list[i].pos;
+ separate_var_list[flag].push_back(variant_list[i]);
+ total_mil += variant_list[i].mil;
+ total_mdl += variant_list[i].mdl;
+ auto ref_sequence = variant_list[i].ref;
+ auto alt_sequences = variant_list[i].alts;
+ min_pos = min(pos, min_pos);
+ max_pos = max((int)(pos + ref_sequence.length()), max_pos);
+ }
+ min_pos = max(min_pos - 1, 0);
+ max_pos = min(max_pos + 1, (int)genome_sequence.length()); //exclusive
+ if (separate_var_list[0].size() == 0 || separate_var_list[1].size() == 0) {
+ return false;
+ }
+ if (separate_var_list[0].size() == 1 && separate_var_list[1].size() == 1){
+ // try direct match to save time
+ if(separate_var_list[0][0] == separate_var_list[1][0]){
+ complex_ref_match_num[thread_index]++;
+ complex_que_match_num[thread_index]++;
+
+ DiploidVariant tv = separate_var_list[0][0];
+ string match_record = to_string(tv.pos+1) + "\t" + tv.ref + "\t" + tv.alts[0];
+ if(tv.multi_alts) match_record += "/" + tv.alts[1];
+ match_record += "\t.\t.\t.\t.\t.\n";
+ complex_match_records[thread_index]->push_back(match_record);
+ // output match result
+ return true;
+ }
+ // if not match, still can match by changing genome
+ }else if(separate_var_list[0].size() == 1 || separate_var_list[1].size() == 1){
+ int flag = 0;
+ if(separate_var_list[1].size() == 1) flag = 1;
+ int r_flag = 1-flag;
+ if(separate_var_list[r_flag].size() > 4){
+ int total_r_mdl = 0;
+ int total_r_mil = 0;
+
+ for(int k = 0; k < separate_var_list[r_flag].size(); k++){
+ DiploidVariant var = separate_var_list[r_flag][k];
+ int var_mdl = var.mdl;
+ int var_mil = var.mil;
+ int ref_length = var.ref.length();
+ total_r_mdl += var_mdl;
+ total_r_mil += var_mil;
+ }
+
+ if(max(separate_var_list[flag][0].mdl, separate_var_list[flag][0].mil) > max(total_r_mdl, total_r_mil)) return false;
+ }
+ }
+
+ // remove singular variant
+ vector<bool> appliable_flag[2];
+ int total_change = total_mil+total_mdl;
+ for(int i = 0; i < 2; i++){
+ for(int k = 0; k < separate_var_list[i].size(); k++){
+ DiploidVariant cur_var = separate_var_list[i][k];
+ int max_change = max(cur_var.mil, cur_var.mdl);
+ if(max_change > total_change-max_change){
+ appliable_flag[i].push_back(false);
+ }else{
+ appliable_flag[i].push_back(true);
+ }
+ }
+ }
+
+ string subsequence = genome_sequence.substr(min_pos, max_pos - min_pos);
+ ToUpper(subsequence); // subsequence only contains upper char
+ int offset = min_pos;
+ int subsequence_length = max_pos - min_pos;
+ list<VariantSelection> variant_selections; // sorted by last matched donor length
+ VariantSelection best_selection;
+ VariantSelection dummy;
+
+ bool overlap_detected = false;
+
+ for(int i = 0; i < 2; i++){
+ int largest_pos = 0;
+ for(int k = 0; k < separate_var_list[i].size(); k++){
+ auto var = separate_var_list[i][k];
+ if(var.pos <= largest_pos){
+ overlap_detected = true;
+ break;
+ }
+ largest_pos = max(largest_pos, (int)(var.pos+var.ref.length()));
+ }
+ if(overlap_detected) break;
+ }
+ dummy.overlap_detected = overlap_detected;
+
+ variant_selections.push_back(dummy);
+
+ map<string, int> score_by_consistent_donor; // donor should be sorted
+
+ while(variant_selections.size() != 0){
+ VariantSelection current_selection = variant_selections.front();
+ variant_selections.pop_front();
+
+ bool get_ref_var = true;
+ int ref_var_taken = current_selection.phasing_vectors[0].size();
+ int que_var_taken = current_selection.phasing_vectors[1].size();
+ if(ref_var_taken >= separate_var_list[0].size()){
+ get_ref_var = false;
+ }else if(que_var_taken < separate_var_list[1].size()){
+ if(current_selection.genome_position[0] > current_selection.genome_position[1]){
+ get_ref_var = false;
+ }else if( current_selection.genome_position[0] == current_selection.genome_position[1]){
+ if(min(current_selection.donor_length[0], current_selection.donor_length[1]) > min(current_selection.donor_length[2], current_selection.donor_length[3])){
+ get_ref_var = false;
+ }
+ }
+ }
+
+ DiploidVariant current_variant;
+ bool can_take_variant = true;
+ if(get_ref_var){
+ can_take_variant = appliable_flag[0][ref_var_taken];
+ current_variant = separate_var_list[0][ref_var_taken];
+ }else{
+ can_take_variant = appliable_flag[1][que_var_taken];
+ current_variant = separate_var_list[1][que_var_taken];
+ }
+
+ int current_flag = current_variant.flag;
+
+// cout << "current selection" << endl;
+// PrintSelection(current_selection);
+// cout << "add variant";
+// PrintVariant(current_variant);
+
+ bool added = false;
+ // make choose decision before not choose decision, save del times
+ if(can_take_variant){
+ added = AddVariantToSelection(variant_selections,
+ current_selection,
+ current_variant,
+ 0,
+ separate_var_list,
+ subsequence,
+ offset,
+ best_selection);
+ // cout << "added state : " << added << endl;
+ // PrintSelectionsList(variant_selections);
+
+ if(current_variant.heterozygous){
+ added = AddVariantToSelection(variant_selections,
+ current_selection,
+ current_variant,
+ 1,
+ separate_var_list,
+ subsequence,
+ offset,
+ best_selection);
+ // cout << "added state : " << added << endl;
+ // PrintSelectionsList(variant_selections);
+ }
+ }
+
+ added= AddVariantToSelection(variant_selections,
+ current_selection,
+ current_variant,
+ -1,
+ separate_var_list,
+ subsequence,
+ offset,
+ best_selection);
+// cout << "added state : " << added << endl;
+// PrintSelectionsList(variant_selections);
+
+ }
+// dout << best_selection.score << endl;
+ if (best_selection.score <= 0) return false;
+// cout << "best selection: " << endl;
+// PrintSelection(best_selection);
+ complex_ref_match_num[thread_index] += best_selection.separate_score[0];
+ complex_que_match_num[thread_index] += best_selection.separate_score[1];
+
+ bool multiple_match = true;
+ if(best_selection.donor_sequences[0] == best_selection.donor_sequences[1]) multiple_match = true;
+// string match_record = to_string(offset) + "\t" + subsequence + "\t" + best_selection.donor_sequences[0];
+// if(multiple_match) match_record += "/" + best_selection.donor_sequences[1];
+ string parsimonious_ref = subsequence;
+ string parsimonious_alt0 = best_selection.donor_sequences[0];
+ string parsimonious_alt1 = best_selection.donor_sequences[1];
+
+ int parsimonious_pos = NormalizeVariantSequence(offset,
+ parsimonious_ref,
+ parsimonious_alt0,
+ parsimonious_alt1);
+
+ string match_record = to_string(parsimonious_pos+1) + "\t" + parsimonious_ref + "\t" + parsimonious_alt0;
+ if(multiple_match) match_record += "/" + parsimonious_alt1;
+
+ string vcf_record[2];
+ string phasing_record[2];
+
+ for (int i = 0; i < 2; i++) {
+ auto final_iter = separate_var_list[i].size()-1;
+ vector<int> phasing_vector = best_selection.phasing_vectors[i];
+ for (int k = 0; k < separate_var_list[i].size(); k++) {
+ int phasing = phasing_vector[k];
+ if(phasing == -1) continue;
+ DiploidVariant variant = separate_var_list[i][k];
+ string alt_string = variant.alts[0];
+ if(variant.multi_alts){
+ alt_string += "/" + variant.alts[1];
+ }
+ string phasing_string = "";
+ if(phasing == 0){
+ phasing_string += "1";
+ if(variant.heterozygous){
+ if(variant.multi_alts){
+ phasing_string += "|2";
+ }else{
+ phasing_string += "|0";
+ }
+ }else{
+ phasing_string += "|1";
+ }
+ }else if(phasing == 1){
+ if(variant.multi_alts){
+ phasing_string += "2|1";
+ }else{
+ phasing_string += "0|1";
+ }
+ }
+ string variant_record = to_string(variant.pos+1) + "," + variant.ref + "," + alt_string;
+ vcf_record[i] += variant_record;
+ phasing_record[i] += phasing_string;
+ if (k != final_iter) {
+ vcf_record[i] += ";";
+ phasing_record[i] += ";";
+ }
+ }
+ }
+ match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+ match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+ match_record += "\t" + to_string(best_selection.score) + "\n";
+
+ complex_match_records[thread_index]->push_back(match_record);
+ // add matching result
+
+ return true;
+}
+
+// Match the reference-set and query-set variants of one cluster by extending
+// candidate selections one variant at a time, in the sorted order of
+// variant_list, and keeping the best-scoring selection.
+//
+// variant_list : variants of the cluster; sorted in place by this call. Each
+//                variant's flag is 0 for the reference set, 1 for the query set.
+// thread_index : slot of the per-thread match counters and record list.
+// cluster_id   : not used by this function.
+//
+// Returns true when a match record was appended to
+// complex_match_records[thread_index]; false when the cluster has fewer than
+// two variants, one of the two sets is empty, or the best selection does not
+// score above 0.
+//
+// Record layout (tab separated):
+//   position, reference sequence, donor[/second donor],
+//   ref variants, query variants, ref phasing, query phasing, score
+// The second donor is written only when the two donor sequences differ.
+bool DiploidVCF::VariantMatchPathCreation(vector<DiploidVariant> & variant_list, int thread_index, int cluster_id){
+    if(variant_list.size() <= 1) return false;
+    sort(variant_list.begin(), variant_list.end()); // here we need to sort
+
+    // separate into ref and que, and find the genome span covered by the cluster
+    vector<DiploidVariant> separate_var_list[2];
+    int min_pos = genome_sequence.length() + 1;
+    int max_pos = -1;
+    for (size_t i = 0; i < variant_list.size(); i++) {
+        int flag = variant_list[i].flag; // flag indicate if the variant is from ref set(0) or query set(1)
+        int pos = variant_list[i].pos;
+        separate_var_list[flag].push_back(variant_list[i]);
+        min_pos = min(pos, min_pos);
+        max_pos = max((int)(pos + variant_list[i].ref.length()), max_pos);
+    }
+    // widen the span by one base on each side, clamped to the genome
+    min_pos = max(min_pos - 1, 0);
+    max_pos = min(max_pos + 1, (int)genome_sequence.length()); //exclusive
+    if (separate_var_list[0].size() == 0 || separate_var_list[1].size() == 0) {
+        return false;
+    }
+    if (separate_var_list[0].size() == 1 && separate_var_list[1].size() == 1){
+        // try direct match to save time
+        if(separate_var_list[0][0] == separate_var_list[1][0]){
+            complex_ref_match_num[thread_index]++;
+            complex_que_match_num[thread_index]++;
+
+            DiploidVariant tv = separate_var_list[0][0];
+            string match_record = to_string(tv.pos) + "\t" + tv.ref + "\t" + tv.alts[0];
+            if(tv.multi_alts) match_record += "/" + tv.alts[1];
+            match_record += "\t.\t.\t.\t.\t.\n";
+            complex_match_records[thread_index]->push_back(match_record);
+            // output match result
+            return true;
+        }
+        // if not match, still can match by changing genome
+    }
+    string subsequence = genome_sequence.substr(min_pos, max_pos - min_pos);
+    ToUpper(subsequence); // subsequence only contains upper char
+    int offset = min_pos;
+
+    list<VariantSelection> variant_selections; // sorted by last matched donor length
+    VariantSelection best_selection;
+    VariantSelection seed_selection;
+    variant_selections.push_back(seed_selection);
+
+    while(!variant_selections.empty()){
+        VariantSelection current_selection = variant_selections.front();
+        variant_selections.pop_front();
+
+        // a selection that has already considered every variant cannot be extended
+        int previous_var_index = current_selection.cur_var;
+        if(previous_var_index >= (int)variant_list.size()-1) continue;
+
+        int cur_var_index = previous_var_index + 1;
+        DiploidVariant current_variant = variant_list[cur_var_index];
+        // update boundary of current_selection
+        current_selection.cur_var = cur_var_index;
+
+        // make choose decision before not choose decision, save del times
+        AddVariantToSelection(variant_selections,
+                              current_selection,
+                              current_variant,
+                              0,
+                              separate_var_list,
+                              subsequence,
+                              offset,
+                              best_selection);
+
+        // a heterozygous variant is also tried with phasing 1
+        if(current_variant.heterozygous){
+            AddVariantToSelection(variant_selections,
+                                  current_selection,
+                                  current_variant,
+                                  1,
+                                  separate_var_list,
+                                  subsequence,
+                                  offset,
+                                  best_selection);
+        }
+
+        // phasing -1: extend the selection without taking the variant
+        AddVariantToSelection(variant_selections,
+                              current_selection,
+                              current_variant,
+                              -1,
+                              separate_var_list,
+                              subsequence,
+                              offset,
+                              best_selection);
+    }
+
+    if (best_selection.score <= 0) return false;
+    complex_ref_match_num[thread_index] += best_selection.separate_score[0];
+    complex_que_match_num[thread_index] += best_selection.separate_score[1];
+
+    // Two donors are reported only when they differ. The original initialised
+    // this flag to true and then "set" it to true again when the donors were
+    // equal, so the second donor was always printed.
+    bool multiple_match = (best_selection.donor_sequences[0] != best_selection.donor_sequences[1]);
+    string match_record = to_string(offset) + "\t" + subsequence + "\t" + best_selection.donor_sequences[0];
+    if(multiple_match) match_record += "/" + best_selection.donor_sequences[1];
+
+    string vcf_record[2];
+    string phasing_record[2];
+    for (int i = 0; i < 2; i++) {
+        const vector<int> & phasing_vector = best_selection.phasing_vectors[i];
+        // Stop at the shorter of the two containers so phasing_vector is never
+        // indexed past its end.
+        // NOTE(review): presumably both always have the same length - confirm.
+        for (size_t k = 0; k < separate_var_list[i].size() && k < phasing_vector.size(); k++) {
+            int phasing = phasing_vector[k];
+            if(phasing == -1) continue; // variant not part of the match
+            const DiploidVariant & variant = separate_var_list[i][k];
+            string alt_string = variant.alts[0];
+            if(variant.multi_alts){
+                alt_string += "/" + variant.alts[1];
+            }
+            string phasing_string = "";
+            if(phasing == 0){
+                phasing_string += "1";
+                if(variant.heterozygous){
+                    phasing_string += variant.multi_alts ? "|2" : "|0";
+                }else{
+                    phasing_string += "|1";
+                }
+            }else if(phasing == 1){
+                phasing_string += variant.multi_alts ? "2|1" : "0|1";
+            }
+            // Separate entries with ';'. Writing the separator before every
+            // entry but the first avoids the trailing ';' the original left
+            // behind when the last variant of a set was not selected.
+            if(!vcf_record[i].empty()){
+                vcf_record[i] += ";";
+                phasing_record[i] += ";";
+            }
+            vcf_record[i] += to_string(variant.pos+1) + "," + variant.ref + "," + alt_string;
+            phasing_record[i] += phasing_string;
+        }
+    }
+    match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+    match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+    match_record += "\t" + to_string(best_selection.score) + "\n";
+
+    complex_match_records[thread_index]->push_back(match_record);
+    // add matching result
+    return true;
+}
+
+// Match the reference-set and query-set variants of one cluster. Candidate
+// selections are extended one variant at a time; at each step the next
+// variant is taken from the reference list or the query list depending on
+// which side currently has the shorter donor sequence.
+//
+// variant_list : variants of the cluster; sorted in place by this call. Each
+//                variant's flag is 0 for the reference set, 1 for the query set.
+// thread_index : slot of the per-thread match counters and record list.
+// cluster_id   : not used by this function.
+//
+// Returns true when a match record was appended to
+// complex_match_records[thread_index]; false when the cluster has fewer than
+// two variants, one of the two sets is empty, the quick length filter rejects
+// the cluster, or the best selection does not score above 0.
+//
+// Record layout (tab separated):
+//   position, reference sequence, donor[/second donor],
+//   ref variants, query variants, ref phasing, query phasing, score
+// The second donor is written only when the two donor sequences differ.
+bool DiploidVCF::VariantMatchPathCreationByDonor(vector<DiploidVariant> & variant_list, int thread_index, int cluster_id){
+    if(variant_list.size() <= 1) return false;
+    sort(variant_list.begin(), variant_list.end()); // here we need to sort
+
+    // separate into ref and que, and find the genome span covered by the cluster
+    vector<DiploidVariant> separate_var_list[2];
+    int min_pos = genome_sequence.length() + 1;
+    int max_pos = -1;
+    for (size_t i = 0; i < variant_list.size(); i++) {
+        int flag = variant_list[i].flag; // flag indicate if the variant is from ref set(0) or query set(1)
+        int pos = variant_list[i].pos;
+        separate_var_list[flag].push_back(variant_list[i]);
+        min_pos = min(pos, min_pos);
+        max_pos = max((int)(pos + variant_list[i].ref.length()), max_pos);
+    }
+    // widen the span by one base on each side, clamped to the genome
+    min_pos = max(min_pos - 1, 0);
+    max_pos = min(max_pos + 1, (int)genome_sequence.length()); //exclusive
+    if (separate_var_list[0].size() == 0 || separate_var_list[1].size() == 0) {
+        return false;
+    }
+    if (separate_var_list[0].size() == 1 && separate_var_list[1].size() == 1){
+        // try direct match to save time
+        if(separate_var_list[0][0] == separate_var_list[1][0]){
+            complex_ref_match_num[thread_index]++;
+            complex_que_match_num[thread_index]++;
+
+            DiploidVariant tv = separate_var_list[0][0];
+            string match_record = to_string(tv.pos) + "\t" + tv.ref + "\t" + tv.alts[0];
+            if(tv.multi_alts) match_record += "/" + tv.alts[1];
+            match_record += "\t.\t.\t.\t.\t.\n";
+            complex_match_records[thread_index]->push_back(match_record);
+            // output match result
+            return true;
+        }
+        // if not match, still can match by changing genome
+    }else if(separate_var_list[0].size() == 1 || separate_var_list[1].size() == 1){
+        // One set holds a single variant and the other holds several. When the
+        // other set has more than 4 variants, give up early if the single
+        // variant's larger mdl/mil value exceeds the larger of the other set's
+        // summed mdl and summed mil.
+        int flag = 0;
+        if(separate_var_list[1].size() == 1) flag = 1;
+        int r_flag = 1-flag;
+        if(separate_var_list[r_flag].size() > 4){
+            int total_r_mdl = 0;
+            int total_r_mil = 0;
+            for(size_t k = 0; k < separate_var_list[r_flag].size(); k++){
+                total_r_mdl += separate_var_list[r_flag][k].mdl;
+                total_r_mil += separate_var_list[r_flag][k].mil;
+            }
+            if(max(separate_var_list[flag][0].mdl, separate_var_list[flag][0].mil) > max(total_r_mdl, total_r_mil)) return false;
+        }
+    }
+    string subsequence = genome_sequence.substr(min_pos, max_pos - min_pos);
+    ToUpper(subsequence); // subsequence only contains upper char
+    int offset = min_pos;
+
+    // Flag the cluster when, inside one set, a variant starts before the
+    // furthest reference end seen so far and extends beyond it.
+    bool overlap_detected = false;
+    for(int i = 0; i < 2 && !overlap_detected; i++){
+        int largest_pos = 0;
+        for(size_t k = 0; k < separate_var_list[i].size(); k++){
+            const DiploidVariant & var = separate_var_list[i][k];
+            if(var.pos < largest_pos && var.pos+var.ref.length() > largest_pos){
+                overlap_detected = true;
+                break;
+            }
+            largest_pos = max(largest_pos, (int)(var.pos+var.ref.length()));
+        }
+    }
+
+    list<VariantSelection> variant_selections; // sorted by last matched donor length
+    VariantSelection best_selection;
+    VariantSelection seed_selection;
+    seed_selection.overlap_detected = overlap_detected;
+    variant_selections.push_back(seed_selection);
+
+    while(!variant_selections.empty()){
+        VariantSelection current_selection = variant_selections.front();
+        variant_selections.pop_front();
+
+        // a selection that has already considered every variant cannot be extended
+        int previous_var_index = current_selection.cur_var;
+        if(previous_var_index >= (int)variant_list.size()-1) continue;
+
+        // Take the next reference variant by default. Switch to the query
+        // list when its shorter donor lags behind the reference side and it
+        // still has variants left, or when the reference list is used up.
+        bool choose_ref = true;
+        int min_ref_donor = min(current_selection.donor_sequences[0].length(), current_selection.donor_sequences[1].length());
+        int min_que_donor = min(current_selection.donor_sequences[2].length(), current_selection.donor_sequences[3].length());
+        if(min_ref_donor > min_que_donor && current_selection.phasing_vectors[1].size() < separate_var_list[1].size()){
+            choose_ref = false;
+        }
+        if(current_selection.phasing_vectors[0].size() >= separate_var_list[0].size()){
+            choose_ref = false;
+        }
+        // the length of a phasing vector is the index of that set's next variant
+        DiploidVariant current_variant;
+        if(choose_ref){
+            current_variant = separate_var_list[0][current_selection.phasing_vectors[0].size()];
+        }else{
+            current_variant = separate_var_list[1][current_selection.phasing_vectors[1].size()];
+        }
+
+        current_selection.cur_var++;
+
+        // make choose decision before not choose decision, save del times
+        AddVariantToSelection(variant_selections,
+                              current_selection,
+                              current_variant,
+                              0,
+                              separate_var_list,
+                              subsequence,
+                              offset,
+                              best_selection);
+
+        // a heterozygous variant is also tried with phasing 1
+        if(current_variant.heterozygous){
+            AddVariantToSelection(variant_selections,
+                                  current_selection,
+                                  current_variant,
+                                  1,
+                                  separate_var_list,
+                                  subsequence,
+                                  offset,
+                                  best_selection);
+        }
+
+        // phasing -1: extend the selection without taking the variant
+        AddVariantToSelection(variant_selections,
+                              current_selection,
+                              current_variant,
+                              -1,
+                              separate_var_list,
+                              subsequence,
+                              offset,
+                              best_selection);
+    }
+
+    if (best_selection.score <= 0) return false;
+    complex_ref_match_num[thread_index] += best_selection.separate_score[0];
+    complex_que_match_num[thread_index] += best_selection.separate_score[1];
+
+    // Two donors are reported only when they differ. The original initialised
+    // this flag to true and then "set" it to true again when the donors were
+    // equal, so the second donor was always printed.
+    bool multiple_match = (best_selection.donor_sequences[0] != best_selection.donor_sequences[1]);
+    string match_record = to_string(offset) + "\t" + subsequence + "\t" + best_selection.donor_sequences[0];
+    if(multiple_match) match_record += "/" + best_selection.donor_sequences[1];
+
+    string vcf_record[2];
+    string phasing_record[2];
+    for (int i = 0; i < 2; i++) {
+        const vector<int> & phasing_vector = best_selection.phasing_vectors[i];
+        // Stop at the shorter of the two containers so phasing_vector is never
+        // indexed past its end.
+        // NOTE(review): presumably both always have the same length - confirm.
+        for (size_t k = 0; k < separate_var_list[i].size() && k < phasing_vector.size(); k++) {
+            int phasing = phasing_vector[k];
+            if(phasing == -1) continue; // variant not part of the match
+            const DiploidVariant & variant = separate_var_list[i][k];
+            string alt_string = variant.alts[0];
+            if(variant.multi_alts){
+                alt_string += "/" + variant.alts[1];
+            }
+            string phasing_string = "";
+            if(phasing == 0){
+                phasing_string += "1";
+                if(variant.heterozygous){
+                    phasing_string += variant.multi_alts ? "|2" : "|0";
+                }else{
+                    phasing_string += "|1";
+                }
+            }else if(phasing == 1){
+                phasing_string += variant.multi_alts ? "2|1" : "0|1";
+            }
+            // Separate entries with ';'. Writing the separator before every
+            // entry but the first avoids the trailing ';' the original left
+            // behind when the last variant of a set was not selected.
+            if(!vcf_record[i].empty()){
+                vcf_record[i] += ";";
+                phasing_record[i] += ";";
+            }
+            vcf_record[i] += to_string(variant.pos+1) + "," + variant.ref + "," + alt_string;
+            phasing_record[i] += phasing_string;
+        }
+    }
+    match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+    match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+    match_record += "\t" + to_string(best_selection.score) + "\n";
+
+    complex_match_records[thread_index]->push_back(match_record);
+    // add matching result
+    return true;
+}
+
+//
+// code reviewed by Chen on 4/4/2016
+// Match one cluster of variants and count every variant that takes part in
+// at least one match found by FindBestMatchWithOverlap.
+//
+// variant_list : variants of the cluster; sorted in place by this call. Each
+//                variant's flag is 0 for the reference set, 1 for the query set.
+// thread_index : slot of the per-thread match counters that are incremented.
+//
+// Returns false when the cluster has fewer than two variants, when one of
+// the two sets is empty, or when no position was selected for one of the
+// sets; true otherwise.
+bool DiploidVCF::VariantMatchWithOverlap(vector<DiploidVariant> & variant_list, int thread_index) {
+    if(variant_list.size() < 2) return false;
+    sort(variant_list.begin(), variant_list.end());
+
+    // Index the variants of each set by position (a later variant at the same
+    // position replaces an earlier one) and track the covered genome span.
+    map<int, DiploidVariant> separate_pos_var[2];
+    int min_pos = genome_sequence.length() + 1;
+    int max_pos = -1;
+    for (auto it = variant_list.begin(); it != variant_list.end(); ++it) {
+        int set_index = it->flag; // 0: ref set, 1: query set
+        int start_pos = it->pos;
+        separate_pos_var[set_index][start_pos] = *it;
+
+        int end_pos = (int)(start_pos + it->ref.length());
+        if(start_pos < min_pos) min_pos = start_pos;
+        if(end_pos > max_pos) max_pos = end_pos;
+    }
+
+    // widen the span by one base on each side, clamped to the genome
+    min_pos = max(min_pos - 1, 0);
+    max_pos = min(max_pos + 1, (int)genome_sequence.length());
+
+    if (separate_pos_var[0].empty() || separate_pos_var[1].empty()) {
+        return false;
+    }
+
+    string subsequence = genome_sequence.substr(min_pos, max_pos - min_pos);
+    map<int, int> selected_positions[2];
+    FindBestMatchWithOverlap(variant_list,
+                             subsequence,
+                             min_pos,
+                             0,
+                             separate_pos_var,
+                             selected_positions);
+
+    if (selected_positions[0].empty() || selected_positions[1].empty()) {
+        return false;
+    }
+
+    complex_ref_match_num[thread_index] += selected_positions[0].size();
+    complex_que_match_num[thread_index] += selected_positions[1].size();
+
+    return true;
+}
+
+//
+// Collect every variant choice that takes part in some match between the
+// reference set and the query set.
+//
+// Each non-empty combination of reference variants is applied to subsequence
+// and stored under the donor sequence it produces (keeping the better-scoring
+// combination per donor). Each non-empty combination of query variants is
+// then looked up by its donor sequence. A hit with a positive total score is
+// accepted directly when match_genotype is off; otherwise, if any chosen
+// variant is heterozygous or multi-allelic, the other chromosome copy of both
+// sides must produce matching sequences as well.
+//
+// variant_list       : not used by this function.
+// subsequence        : reference sequence of the cluster region.
+// offset             : passed through to ModifyRefMultiVar together with subsequence.
+// index              : not used by this function.
+// separate_pos_var   : variants by position; [0] reference set, [1] query set.
+// selected_positions : output; position -> allele index of every accepted
+//                      choice, [0] reference set, [1] query set. An entry
+//                      already present for a position is kept.
+//
+// Always returns true.
+bool DiploidVCF::FindBestMatchWithOverlap(vector<DiploidVariant> & variant_list,
+                                          const string subsequence,
+                                          const int offset,
+                                          int index,
+                                          map<int, DiploidVariant> separate_pos_var[],
+                                          map<int, int> selected_positions[])
+{
+    // per set (0 ref, 1 query): variant positions and whether each is multi_alts
+    vector<int> positions[2];
+    vector<bool> indicators[2];
+    for (int s = 0; s < 2; s++) {
+        for (auto it = separate_pos_var[s].begin(); it != separate_pos_var[s].end(); ++it) {
+            positions[s].push_back(it->second.pos);
+            indicators[s].push_back(it->second.multi_alts);
+        }
+    }
+
+    // record (position, allele) choices; map::insert keeps an existing entry
+    auto remember = [&](int set_index, const vector<vector<int>> & choices){
+        for (size_t k = 0; k < choices.size(); k++) {
+            selected_positions[set_index].insert(make_pair(choices[k][0], choices[k][1]));
+        }
+    };
+
+    // Derive the choices for the other chromosome copy: the other alt for a
+    // multi_alts variant, the reference allele (-1) for a heterozygous one,
+    // the same allele otherwise. Raises the matching flag when it sees one.
+    auto other_copy = [&](int set_index,
+                          const vector<vector<int>> & choices,
+                          bool & has_multi,
+                          bool & has_heter) -> vector<vector<int>> {
+        vector<vector<int>> other;
+        for (size_t k = 0; k < choices.size(); k++) {
+            int pos = choices[k][0];
+            int allele = choices[k][1];
+            DiploidVariant variant = separate_pos_var[set_index][pos];
+            if (variant.multi_alts) {
+                has_multi = true;
+                allele = 1 - allele;
+            } else if (variant.heterozygous) {
+                has_heter = true;
+                allele = -1;
+            }
+            other.push_back(vector<int>({pos, allele}));
+        }
+        return other;
+    };
+
+    // construct ref combinations in hash table, key is donor sequence
+    unordered_map<string, vector<vector<int>> > seq_choice_ref;
+    unordered_map<string, int> seq_score_ref; // score stored for the kept combination
+    for (int chosen = 1; chosen <= (int)positions[0].size(); chosen++) { // chosen: how many variants are picked
+        vector<vector<vector<int>>> ref_choice_list = Combine(positions[0], indicators[0], chosen);
+        for (size_t c = 0; c < ref_choice_list.size(); c++) {
+            // each combination is a vector of pairs(position, alt_index)
+            string donor;
+            int score;
+            ModifyRefMultiVar(subsequence, offset, separate_pos_var[0], ref_choice_list[c], donor, score);
+            if (CompareSequence(donor, subsequence)) continue; // donor equals the reference
+            bool known = seq_choice_ref.find(donor) != seq_choice_ref.end();
+            if (known && seq_score_ref[donor] > score) continue;
+            // either overwrite or insert new
+            seq_choice_ref[donor] = ref_choice_list[c];
+            seq_score_ref[donor] = score;
+        }
+    }
+
+    // look every query combination up in the reference table
+    for (int chosen = 1; chosen <= (int)positions[1].size(); chosen++) {
+        vector<vector<vector<int>>> que_choice_list = Combine(positions[1], indicators[1], chosen);
+        for (size_t c = 0; c < que_choice_list.size(); c++) {
+            string donor;
+            int score;
+            ModifyRefMultiVar(subsequence, offset, separate_pos_var[1], que_choice_list[c], donor, score);
+            if (CompareSequence(donor, subsequence)) continue;
+            if (seq_choice_ref.find(donor) == seq_choice_ref.end()) continue;
+
+            int total_score = seq_score_ref[donor] + score;
+            if (total_score <= 0) continue;
+
+            // this time we don't find max, but all, and put them in a set
+            vector<vector<int>> ref_var_choices = seq_choice_ref[donor];
+            vector<vector<int>> que_var_choices = que_choice_list[c];
+
+            bool genotype_ok = true;
+            if (match_genotype) {
+                bool local_multi = false;
+                bool local_heter = false;
+                vector<vector<int>> ref_other_choices = other_copy(0, ref_var_choices, local_multi, local_heter);
+                vector<vector<int>> que_other_choices = other_copy(1, que_var_choices, local_multi, local_heter);
+                if (local_multi || local_heter) {
+                    // also check the other chromosome matches
+                    int temp_score;
+                    string ref_other_donor;
+                    ModifyRefMultiVar(subsequence, offset, separate_pos_var[0], ref_other_choices, ref_other_donor, temp_score);
+                    string que_other_donor;
+                    ModifyRefMultiVar(subsequence, offset, separate_pos_var[1], que_other_choices, que_other_donor, temp_score);
+                    genotype_ok = CompareSequence(ref_other_donor, que_other_donor);
+                }
+            }
+
+            if (genotype_ok) {
+                remember(0, ref_var_choices);
+                remember(1, que_var_choices);
+            }
+        }
+    }
+
+    // Sanity check: applying all selected choices of each set at once should
+    // give the same sequence on both sides; print a notice when it does not.
+    if (selected_positions[0].size() > 0 && selected_positions[1].size() > 0) {
+        vector<vector<int>> ref_set_choices;
+        vector<vector<int>> que_set_choices;
+        for (auto it = selected_positions[0].begin(); it != selected_positions[0].end(); ++it) {
+            ref_set_choices.push_back(vector<int>({it->first, it->second}));
+        }
+        for (auto it = selected_positions[1].begin(); it != selected_positions[1].end(); ++it) {
+            que_set_choices.push_back(vector<int>({it->first, it->second}));
+        }
+        int temp_score;
+        string ref_set_donor;
+        ModifyRefMultiVar(subsequence, offset, separate_pos_var[0], ref_set_choices, ref_set_donor, temp_score);
+        string que_set_donor;
+        ModifyRefMultiVar(subsequence, offset, separate_pos_var[1], que_set_choices, que_set_donor, temp_score);
+        if (!CompareSequence(ref_set_donor, que_set_donor)) {
+            cout << "Overlap matching does not agree with non-overlap one";
+        }
+    }
+    return true;
+}
+
+// code reviewed by Chen on 4/3/2016
+// Find the highest-scoring pair of (reference combination, query combination)
+// whose donor sequences agree.
+//
+// Each non-empty combination of reference variants is applied to subsequence
+// and stored under the donor sequence it produces (keeping the better-scoring
+// combination per donor). Each non-empty combination of query variants is
+// then looked up by its donor sequence; a hit replaces the current best when
+// its total score is higher than max_score and, with match_genotype on, the
+// other chromosome copy of both sides also agrees.
+//
+// variant_list       : not used by this function.
+// subsequence        : reference sequence of the cluster region.
+// offset             : passed through to ModifyRefMultiVar together with subsequence.
+// index              : not used by this function.
+// separate_pos_var   : variants by position; [0] reference set, [1] query set.
+// max_choices        : output, 4 vectors: [0] reference choices, [2] query
+//                      choices; [1] and [3] (the other copy of each side) are
+//                      written only for a multi-allelic best match.
+// max_score          : in/out; best total score so far.
+// max_heterozygosity : output; set to true only for a multi-allelic best match.
+// max_paths          : output; [0] donor of the best match, [1] donor of the
+//                      other copy (multi-allelic best match only).
+//
+// Returns true when max_score is above 0 on exit.
+bool DiploidVCF::FindBestMatch(vector<DiploidVariant> & variant_list,
+                               const string subsequence,
+                               const int offset,
+                               int index,
+                               map<int, DiploidVariant> separate_pos_var[],
+                               vector<vector<int>> max_choices[], // 4 vectors
+                               int & max_score,
+                               bool & max_heterozygosity,
+                               string max_paths[])
+{
+    set<int> selected_positions[2]; // debug bookkeeping only, see end of function
+
+    // per set (0 ref, 1 query): variant positions and whether each is multi_alts
+    vector<int> positions[2];
+    vector<bool> indicators[2];
+    for (int s = 0; s < 2; s++) {
+        for (auto it = separate_pos_var[s].begin(); it != separate_pos_var[s].end(); ++it) {
+            positions[s].push_back(it->second.pos);
+            indicators[s].push_back(it->second.multi_alts);
+        }
+    }
+
+    // Derive the choices for the other chromosome copy: the other alt for a
+    // multi_alts variant, the reference allele (-1) for a heterozygous one,
+    // the same allele otherwise. Raises the matching flag when it sees one.
+    auto other_copy = [&](int set_index,
+                          const vector<vector<int>> & choices,
+                          bool & has_multi,
+                          bool & has_heter) -> vector<vector<int>> {
+        vector<vector<int>> other;
+        for (size_t k = 0; k < choices.size(); k++) {
+            int pos = choices[k][0];
+            int allele = choices[k][1];
+            DiploidVariant variant = separate_pos_var[set_index][pos];
+            if (variant.multi_alts) {
+                has_multi = true;
+                allele = 1 - allele;
+            } else if (variant.heterozygous) {
+                has_heter = true;
+                allele = -1;
+            }
+            other.push_back(vector<int>({pos, allele}));
+        }
+        return other;
+    };
+
+    // construct ref combinations in hash table, key is donor sequence
+    unordered_map<string, vector<vector<int>> > seq_choice_ref;
+    unordered_map<string, int> seq_score_ref; // score stored for the kept combination
+    for (int chosen = 1; chosen <= (int)positions[0].size(); chosen++) { // chosen: how many variants are picked
+        vector<vector<vector<int>>> ref_choice_list = Combine(positions[0], indicators[0], chosen);
+        for (size_t c = 0; c < ref_choice_list.size(); c++) {
+            // each combination is a vector of pairs(position, alt_index)
+            string donor;
+            int score;
+            ModifyRefMultiVar(subsequence, offset, separate_pos_var[0], ref_choice_list[c], donor, score);
+            if (CompareSequence(donor, subsequence)) continue; // donor equals the reference
+            bool known = seq_choice_ref.find(donor) != seq_choice_ref.end();
+            if (known && seq_score_ref[donor] > score) continue;
+            // either overwrite or insert new
+            seq_choice_ref[donor] = ref_choice_list[c];
+            seq_score_ref[donor] = score;
+        }
+    }
+
+    // look every query combination up in the reference table
+    for (int chosen = 1; chosen <= (int)positions[1].size(); chosen++) {
+        vector<vector<vector<int>>> que_choice_list = Combine(positions[1], indicators[1], chosen);
+        for (size_t c = 0; c < que_choice_list.size(); c++) {
+            string donor;
+            int score;
+            ModifyRefMultiVar(subsequence, offset, separate_pos_var[1], que_choice_list[c], donor, score);
+            if (CompareSequence(donor, subsequence)) continue;
+            if (seq_choice_ref.find(donor) == seq_choice_ref.end()) continue;
+
+            int total_score = seq_score_ref[donor] + score;
+            vector<vector<int>> ref_var_choices = seq_choice_ref[donor];
+            vector<vector<int>> que_var_choices = que_choice_list[c];
+
+            if (total_score <= 0) continue;
+
+            // for debug: remember every position that took part in any match
+            if (!match_genotype) {
+                for (size_t k = 0; k < ref_var_choices.size(); k++) {
+                    selected_positions[0].insert(ref_var_choices[k][0]);
+                }
+                for (size_t k = 0; k < que_var_choices.size(); k++) {
+                    selected_positions[1].insert(que_var_choices[k][0]);
+                }
+            }
+
+            if (total_score <= max_score) continue; // not better than the current best
+
+            bool local_multi = false;
+            bool local_heter = false;
+            vector<vector<int>> ref_other_choices;
+            vector<vector<int>> que_other_choices;
+            string ref_other_donor;
+
+            if (match_genotype) {
+                // check and construct heterozygous alleles
+                ref_other_choices = other_copy(0, ref_var_choices, local_multi, local_heter);
+                que_other_choices = other_copy(1, que_var_choices, local_multi, local_heter);
+                if (local_multi || local_heter) {
+                    // also check the other chromosome matches
+                    int temp_score;
+                    ModifyRefMultiVar(subsequence, offset, separate_pos_var[0], ref_other_choices, ref_other_donor, temp_score);
+                    string que_other_donor;
+                    ModifyRefMultiVar(subsequence, offset, separate_pos_var[1], que_other_choices, que_other_donor, temp_score);
+                    if (!CompareSequence(ref_other_donor, que_other_donor)) continue;
+                }
+            }
+
+            // new best match
+            max_choices[0] = ref_var_choices;
+            max_choices[2] = que_var_choices;
+            max_paths[0] = donor;
+            max_score = total_score;
+            if (local_multi) {
+                // only a multi-allelic match stores the other copy as well
+                max_choices[1] = ref_other_choices;
+                max_choices[3] = que_other_choices;
+                max_paths[1] = ref_other_donor;
+                max_heterozygosity = true;
+            } else {
+                max_heterozygosity = false;
+                //delay construct optimal solution at the very end.
+            }
+        }
+    }
+
+    if (max_score <= 0) return false;
+
+    if (max_choices[0].size() < selected_positions[0].size() || max_choices[2].size() < selected_positions[1].size()) {
+        //dout << "overlap match differs!" << endl;
+    }
+    return true;
+}
+
+// Enumerate all ways of choosing k of the given variants, with an allele
+// index attached to every chosen variant.
+//
+// positions        : variant positions.
+// heter_indicators : per variant, true when it is heterozygous.
+// multi_indicators : per variant, true when it has multiple alts.
+// k                : number of variants to choose.
+//
+// Returns one entry per combination, each a list of {position, allele index}
+// pairs as produced by FindDiploidComb. The result is empty when k is 0 or
+// larger than the number of positions.
+vector<vector<vector<int>>> DiploidVCF::DiploidCombine(vector<int> & positions,
+                                                       vector<bool> & heter_indicators,
+                                                       vector<bool> & multi_indicators,
+                                                       int k) {
+    vector<vector<vector<int>>> combinations;
+    bool k_in_range = !(k == 0 || k > positions.size());
+    if (k_in_range) {
+        vector<vector<int>> partial; // combination under construction
+        FindDiploidComb(positions,
+                        heter_indicators,
+                        multi_indicators,
+                        0,
+                        k,
+                        partial,
+                        combinations);
+    }
+    return combinations;
+}
+
+// Recursive helper of DiploidCombine: extend the partial combination sol with
+// k more variants taken from positions[start..] and append every completed
+// combination to all_sol.
+//
+// Every chosen variant is tried with allele index 0. A heterozygous variant
+// is additionally tried with its second allele: index 1 when the variant has
+// multiple alts, otherwise -1 (the reference allele).
+//
+// positions        : variant positions.
+// heter_indicators : per variant, true when it is heterozygous.
+// multi_indicators : per variant, true when it has multiple alts.
+// start            : first index of positions that may still be chosen.
+// k                : number of variants still to choose.
+// sol              : combination under construction; restored before returning.
+// all_sol          : output; receives each complete combination.
+void DiploidVCF::FindDiploidComb(vector<int> & positions,
+                                 vector<bool> & heter_indicators,
+                                 vector<bool> & multi_indicators,
+                                 int start,
+                                 int k,
+                                 vector<vector<int> > & sol,
+                                 vector<vector<vector<int>>> & all_sol)
+{
+    if (k == 0) {
+        all_sol.push_back(sol);
+        return;
+    }
+    int n = positions.size();
+    // i stops at n - k so that enough variants remain to complete the combination
+    for (int i = start; i <= n - k; i++) {
+        sol.push_back(vector<int>({ positions[i], 0 }));
+        // Recurse into this function, not FindComb: the original call dropped
+        // heter_indicators, so the second allele of a heterozygous variant was
+        // only ever tried for the first variant of a combination.
+        FindDiploidComb(positions, heter_indicators, multi_indicators, i + 1, k - 1, sol, all_sol);
+        sol.pop_back();
+        if (heter_indicators[i]) { // try second allele
+            int second_allele = multi_indicators[i] ? 1 : -1;
+            sol.push_back(vector<int>({ positions[i], second_allele }));
+            FindDiploidComb(positions, heter_indicators, multi_indicators, i + 1, k - 1, sol, all_sol);
+            sol.pop_back();
+        }
+    }
+}
+
+// Exhaustive search for the highest-scoring pair of (reference subset, query subset)
+// whose two donor haplotype sequences are identical.
+//
+// separate_pos_var[0] / [1] : position -> variant maps for reference / query.
+// subsequence, offset       : passed unchanged to ModifyRefMultiVar together with the variant maps.
+// max_choices[0..3]         : on improvement receives ref choice, ref complementary choice,
+//                             query choice, query complementary choice.
+// max_paths[0..1]           : on improvement receives the two query donor sequences.
+// max_score, max_heterozygosity : updated only when a strictly better total score is found.
+// variant_list and index are not read in this function. It always returns true.
+bool DiploidVCF::FindBestDiploidMatch(vector<DiploidVariant> & variant_list,
+ const string subsequence,
+ const int offset,
+ int index,
+ map<int, DiploidVariant> separate_pos_var[],
+ vector<vector<int>> max_choices[], // 4 vectors
+ int & max_score,
+ bool & max_heterozygosity,
+ string max_paths[]){
+
+ // Flatten both variant maps into parallel vectors that DiploidCombine consumes.
+ vector<int> positions[2]; // 0 from ref, 1 from query
+ vector<bool> heter_indicators[2]; // 0 from ref, 1 from query, indicate if heterozygous(true) or not(false)
+ vector<bool> multi_indicators[2]; // indicate if contains multi alt, if heter but not multi, then the other choice is ref(-1)
+ for (int i = 0; i < 2; i++) {
+ for (auto it = separate_pos_var[i].begin(); it != separate_pos_var[i].end(); ++it) {
+ DiploidVariant v = it->second;
+ positions[i].push_back(v.pos);
+ heter_indicators[i].push_back(v.heterozygous);
+ multi_indicators[i].push_back(v.multi_alts);
+ }
+ }
+
+ // Phase 1: index every reference selection by its donor key "<smaller donor>,<larger donor>".
+ map<string, vector<vector<int>> > seq_choice1_ref;
+ map<string, vector<vector<int>> > seq_choice2_ref;
+ map<string, int> seq_score_ref; // corresponding score, if same key, store the one with highest score
+ for (int i = 1; i <= positions[0].size(); i++) { // i : how many variants are chosen
+ vector<vector<vector<int>>> ref_choice_list = DiploidCombine(positions[0], heter_indicators[0], multi_indicators[0], i);
+
+ for (auto rit = ref_choice_list.begin(); rit != ref_choice_list.end(); ++rit) { // iterate all combinations with i variants
+ // each combination is a vector of pairs(position, alt_index), alt_index is 0 or 1 (if multi_alts)
+ vector<vector<int>> one_choice = *rit;
+ vector<vector<int>> another_choice;
+ // generate another choice;
+ // another_choice is the complementary haplotype: multi-ALT sites get the other ALT,
+ // heterozygous single-ALT sites swap ALT(0) and REF(-1), remaining sites are copied.
+ bool multi_chr = false;
+ for(int ri = 0; ri < one_choice.size(); ri++){
+ int ref_pos = one_choice[ri][0];
+ DiploidVariant ref_variant = separate_pos_var[0][ref_pos];
+ if (ref_variant.multi_alts){
+ multi_chr = true;
+ another_choice.push_back(vector<int>({ref_pos, 1 - one_choice[ri][1]}));
+ }else if(ref_variant.heterozygous){
+ multi_chr = true;
+ int another_allele = -1;
+ if(one_choice[ri][1] == -1) another_allele = 0;
+ another_choice.push_back(vector<int>({ref_pos,another_allele}));
+ }else{
+ another_choice.push_back(vector<int>({ref_pos, one_choice[ri][1]}));
+ }
+ }
+ string one_donor;
+ string another_donor;
+ int one_score;
+ int another_score;
+ ModifyRefMultiVar(subsequence, offset, separate_pos_var[0], one_choice, one_donor, one_score);
+ // The second donor is only built when the two haplotypes can differ;
+ // another_score is written here but never read afterwards.
+ if(multi_chr){
+ ModifyRefMultiVar(subsequence, offset, separate_pos_var[0], another_choice, another_donor, another_score);
+ }else{
+ another_donor = one_donor;
+ }
+ // Order the two donors so the key does not depend on which haplotype came first.
+ string donor;
+ if(one_donor < another_donor){
+ donor = one_donor + "," + another_donor;
+ }else{
+ donor = another_donor + "," + one_donor;
+ }
+ // key will be donor string
+ // Skip when CompareSequence reports true against the unmodified pair
+ // (presumably: both donors equal the reference subsequence - confirm against CompareSequence).
+ if(CompareSequence(donor, subsequence+","+subsequence)) continue;
+ int score = one_score;
+
+ // Keep the stored entry only if its score is strictly greater; ties overwrite.
+ if(seq_choice1_ref.find(donor) != seq_choice1_ref.end() && seq_score_ref[donor] > score){
+ continue;
+ }else{
+ // either overwrite or insert new
+ seq_choice1_ref[donor] = one_choice;
+ seq_choice2_ref[donor] = another_choice;
+ seq_score_ref[donor] = score;
+ }
+ //dout << "ref-donor: " << donor << endl;
+ }
+ }
+
+ // Phase 2: build the same donor key for every query selection and look it up in the reference index.
+ // by now generate all combinations of ref variant set, with sorted donor sequences as key
+ for(int i = 1; i <= positions[1].size(); i++){
+ // iterate all combinations with i variants
+ vector<vector<vector<int>>> que_choice_list = DiploidCombine(positions[1], heter_indicators[1], multi_indicators[1], i);
+ for (auto qit = que_choice_list.begin(); qit != que_choice_list.end(); ++qit){
+ vector<vector<int>> one_choice = *qit;
+ vector<vector<int>> another_choice;
+ bool multi_chr = false;
+ for(int qi = 0; qi < one_choice.size(); qi++){
+ int que_pos = one_choice[qi][0];
+ DiploidVariant que_variant = separate_pos_var[1][que_pos];
+ if(que_variant.multi_alts){
+ multi_chr = true;
+ another_choice.push_back(vector<int>({que_pos, 1- one_choice[qi][1]}));
+ }else if (que_variant.heterozygous){
+ multi_chr = true;
+ int another_allele = -1;
+ if(one_choice[qi][1] == -1) another_allele = 0;
+ another_choice.push_back(vector<int>({que_pos, another_allele}));
+ }else{
+ another_choice.push_back(vector<int>({que_pos, one_choice[qi][1]}));
+ }
+ }
+ string one_donor;
+ string another_donor;
+ int one_score;
+ int another_score;
+ ModifyRefMultiVar(subsequence, offset, separate_pos_var[1], one_choice, one_donor, one_score);
+ if(multi_chr){
+ ModifyRefMultiVar(subsequence, offset, separate_pos_var[1], another_choice, another_donor, another_score);
+ }else{
+ another_donor = one_donor;
+ }
+ string donor;
+ if(one_donor < another_donor){
+ donor = one_donor + "," + another_donor;
+ }else{
+ donor = another_donor + "," + one_donor;
+ }
+
+ // A hit means some reference selection yields exactly the same donor pair.
+ // The combined score is the stored reference score plus this query selection's score.
+ if(seq_choice1_ref.find(donor) != seq_choice1_ref.end()){
+ int total_score = seq_score_ref[donor] + one_score;
+ if (total_score > max_score){
+ max_score = total_score;
+ max_paths[0] = one_donor;
+ max_paths[1] = another_donor;
+ max_heterozygosity = multi_chr;
+ max_choices[0] = seq_choice1_ref[donor];
+ max_choices[1] = seq_choice2_ref[donor];
+ max_choices[2] = one_choice;
+ max_choices[3] = another_choice;
+ }
+ }
+ }
+ }
+ return true;
+}
+
+//[todo] support variant match without haplotype
+
+// code reviewed by Channing on 4/3/2016
+void DiploidVCF::ModifyRefMultiVar(const string & genome,
+        int offset,
+        map<int, DiploidVariant> & pos_var,
+        vector<vector<int>> pos_choice,
+        string & donor,
+        int & score) {
+    // Build a donor sequence by applying the selected alleles to `genome`.
+    //
+    // genome     : sequence window to modify (upper-cased into donor first).
+    // offset     : subtracted from each variant position to index into the window.
+    // pos_var    : position -> variant lookup; operator[] is used, so a missing
+    //              position inserts a default-constructed variant.
+    // pos_choice : {position, allele index} pairs; index >= 0 selects
+    //              alts[index], a negative index keeps the REF allele.
+    // donor      : output, the modified upper-case sequence.
+    // score      : output, number of applied variants (or summed REF length
+    //              when scoring_basepair is set); stays 0 on any early return.
+    donor = genome;
+    score = 0; // if return before end of function, score = 0
+    int local_score = 0;
+    transform(donor.begin(), donor.end(), donor.begin(), ::toupper);
+    // Apply variants right-to-left so earlier edits do not shift the
+    // coordinates of the ones still to be applied.
+    std::sort(pos_choice.begin(), pos_choice.end(),
+        [](const std::vector<int>& a, const std::vector<int>& b) {
+            return a[0] > b[0];
+        });
+    for (size_t i = 0; i < pos_choice.size(); i++) {
+        assert(pos_choice[i].size() == 2);
+        const int pos = pos_choice[i][0];
+        const int alt_index = pos_choice[i][1];
+        const int offset_pos = pos - offset;
+        const DiploidVariant & variant = pos_var[pos];
+        // Reject an allele index the variant does not carry; this also keeps
+        // the alts[] access below in bounds.
+        if ((alt_index > 0 && !variant.multi_alts) ||
+            alt_index >= (int)variant.alts.size()) {
+            dout << "[VarMatch] Warning: modify reference genome with allele not exist" << endl;
+            return;
+        }
+        const int offset_end = offset_pos + (int)variant.ref.length();
+        const string & alt = (alt_index >= 0) ? variant.alts[alt_index] : variant.ref;
+        const int donor_length = (int)donor.length();
+        // A variant starting before the window or reaching past its end cannot
+        // be applied (most often caused by overlapping variants). A negative
+        // start would otherwise wrap to a huge unsigned value inside substr().
+        if (offset_pos < 0 || offset_pos > donor_length || offset_end > donor_length) {
+            return;
+        }
+        donor = donor.substr(0, offset_pos) + alt + donor.substr(offset_end);
+        if (scoring_basepair) {
+            local_score += variant.ref.size();
+        } else {
+            local_score++;
+        }
+    }
+    // Inserted alleles may be lower case; normalise the final donor.
+    transform(donor.begin(), donor.end(), donor.begin(), ::toupper);
+    //only assign score here, if fail to change reference, score will be 0
+    score = local_score;
+    return;
+}
+
+// Manual smoke-test hook. The scenario below is commented out, so the
+// function currently does nothing and returns 0.
+int DiploidVCF::test() {
+// genome_sequence = "GTCAGCCGG";
+// DiploidVariant d1(1, "T", vector<string> ({"A", "C"}), true, true, 0);
+// DiploidVariant d2(4, "G", vector<string> ({"C", ""}), true, false, 0);
+// DiploidVariant d3(5, "C", vector<string> ({"T", ""}), true, false, 0); // this is false negative
+// DiploidVariant d4(6, "C", vector<string> ({"G", ""}), true, false, 0);
+// DiploidVariant d5(7, "G", vector<string> ({"A", ""}), true, false, 0);
+// DiploidVariant d6(1, "T", vector<string> ({"A", "C"}), true, true, 1);
+// DiploidVariant d7(3, "AG", vector<string> ({"A", ""}), true, false, 1);
+// DiploidVariant d8(7, "G", vector<string> ({"GA", ""}), true, false, 1);
+//
+// complex_ref_match_num.push_back(0);
+// complex_que_match_num.push_back(0);
+// complex_match_records = new vector<string>*[1];
+// complex_match_records[0] = new vector<string>;
+// //vector<DiploidVariant> var_list = { d2,d3,d4,d5,d7,d8 };
+// vector<DiploidVariant> var_list = { d1,d2,d3,d4,d5,d6,d7,d8 };
+// cout << VariantMatchPathCreation(var_list, 0,0) << endl;
+ return 0;
+}
+
+//int DiploidVCF::test() {
+// genome_sequence = "AATATAT";
+//
+// DiploidVariant d1(0, vector<char>({ 'D', 'S' }), "AAT", vector<string>({ "A", "A" }), "1/2", false, false, 0);
+// DiploidVariant d2(0, vector<char>({ 'D', 'S' }), "AAT", vector<string>({ "A", "" }), "0/1", true, false, 1);
+// DiploidVariant d3(4, vector<char>({ 'D', 'S' }), "TAT", vector<string>({ "T", "" }), "0/1", true, false, 1);
+//
+// complex_ref_match_num.push_back(0);
+// complex_que_match_num.push_back(0);
+// complex_match_records = new vector<string>*[1];
+// complex_match_records[0]= new vector<string>;
+// //vector<DiploidVariant> var_list = { d2,d3,d4,d5,d7,d8 };
+// vector<DiploidVariant> var_list = { d1,d2,d3 };
+// cout << VariantMatchPathCreation(var_list, 0) << endl;
+// return 0;
+//}
+
+void DiploidVCF::SortVariantList(){
+    // Order the combined variant list using DiploidVariant::operator<.
+    sort(variant_list.begin(), variant_list.end(),
+         [](const DiploidVariant & lhs, const DiploidVariant & rhs) {
+             return lhs < rhs;
+         });
+}
+
+// code reviewed by Chen on 4/4/2016
+void DiploidVCF::ClusteringVariants() {
+    // Sort variant_list by position and partition it into clusters stored in
+    // cluster_vars_map (cluster index -> variants).
+    //
+    // A new cluster is started when the gap between the furthest REF end seen
+    // so far and the next variant is at least 2 bases and either
+    //   * no indel length has accumulated in the current cluster, or
+    //   * the gap is longer than twice the accumulated change and is either
+    //     longer than MAX_REPEAT_LEN or rejected by CheckTandemRepeat.
+    //
+    // in DiploidVariant, flag = 0 is reference, flag = 1 is query
+    // NOTE(review): variant_list is assumed to be filled by the caller with
+    // both reference and query variants before this runs.
+    dsptime();
+    sort(variant_list.begin(), variant_list.end());
+    dsptime();
+
+    int cluster_index = 0;
+    int ins_len[2] = { 0 }; // accumulated insertion length, indexed by flag
+    int del_len[2] = { 0 }; // accumulated deletion length, indexed by flag
+    int c_start = 0;        // furthest REF end seen in the current cluster
+
+    for (size_t i = 0; i < variant_list.size(); i++) {
+        const DiploidVariant & snp = variant_list[i];
+        const int ref_length = (int)snp.ref.length();
+        // check if need to separator clusters
+        if (i > 0) {
+            const int c_end = snp.pos;
+            if (c_end - c_start >= 2) {
+                const int separator_length = c_end - c_start;
+                string separator = genome_sequence.substr(c_start, separator_length);
+                const int max_change = max(ins_len[0] + del_len[1], ins_len[1] + del_len[0]);
+                bool separate_cluster = false;
+                if (max_change == 0) {
+                    separate_cluster = true;
+                }
+                else if (separator_length > 2 * max_change &&
+                    (separator_length > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, max_change)))
+                {
+                    separate_cluster = true;
+                }
+
+                if (separate_cluster) {
+                    cluster_index++;
+                    ins_len[0] = 0;
+                    del_len[0] = 0;
+                    ins_len[1] = 0;
+                    del_len[1] = 0;
+                    c_start = 0; // re-assign c_start
+                }
+            }
+        }
+        c_start = max(c_start, snp.pos + ref_length);
+
+        // assign snp to cluster
+        cluster_vars_map[cluster_index].push_back(snp);
+
+        // Largest insertion / deletion this variant can introduce. For a
+        // multi-ALT variant the second ALT must update these same variables;
+        // re-declaring them inside the branch would shadow them and discard
+        // the result.
+        int snp_ins = max(0, (int)snp.alts[0].length() - ref_length);
+        int snp_del = max(0, ref_length - (int)snp.alts[0].length());
+        if (snp.multi_alts) {
+            snp_ins = max(snp_ins, (int)snp.alts[1].length() - ref_length);
+            snp_del = max(snp_del, ref_length - (int)snp.alts[1].length());
+        }
+        const int flag = snp.flag;
+        ins_len[flag] += snp_ins;
+        del_len[flag] += snp_del;
+    }
+}
+
+void DiploidVCF::LinearClusteringVariants() {
+    // Merge the sorted reference and query variant lists in ascending
+    // position order and partition the merged stream into clusters stored in
+    // cluster_vars_map (cluster index -> variants).
+    //
+    // On equal positions the query variant is taken first. A new cluster is
+    // started when the gap between the furthest REF end seen so far and the
+    // next variant is at least 2 bases and either no indel length has
+    // accumulated, or the gap is longer than twice the accumulated change and
+    // is either longer than MAX_REPEAT_LEN or rejected by CheckTandemRepeat.
+    int cluster_index = 0;
+    int ins_len[2] = { 0 }; // accumulated snp.mil, indexed by flag
+    int del_len[2] = { 0 }; // accumulated snp.mdl, indexed by flag
+    int c_start = 0;        // furthest REF end seen in the current cluster
+    sort(ref_variant_list.begin(), ref_variant_list.end());
+    sort(que_variant_list.begin(), que_variant_list.end());
+    const int ref_size = ref_variant_list.size();
+    const int que_size = que_variant_list.size();
+
+    int ref_index = 0;
+    int que_index = 0;
+    bool not_first = false;
+    while (ref_index < ref_size || que_index < que_size) {
+        bool take_que = true;
+        if (ref_index < ref_size && que_index < que_size) {
+            if (ref_variant_list[ref_index].pos < que_variant_list[que_index].pos) {
+                take_que = false;
+            }
+        } else if (ref_index < ref_size) {
+            take_que = false;
+        }
+
+        // Neither list is modified inside the loop, so a reference is safe
+        // and avoids copying the variant twice.
+        const DiploidVariant & snp = take_que ? que_variant_list[que_index++]
+                                              : ref_variant_list[ref_index++];
+
+        // check if need to separator clusters
+        if (not_first) {
+            const int c_end = snp.pos;
+            if (c_end - c_start >= 2) {
+                const int separator_length = c_end - c_start;
+                string separator = genome_sequence.substr(c_start, separator_length);
+                const int max_change = max(ins_len[0] + del_len[1], ins_len[1] + del_len[0]);
+                bool separate_cluster = false;
+                if (max_change == 0) {
+                    separate_cluster = true;
+                }
+                else if (separator_length > 2 * max_change &&
+                    (separator_length > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, max_change)))
+                {
+                    separate_cluster = true;
+                }
+
+                if (separate_cluster) {
+                    cluster_index++;
+                    ins_len[0] = 0;
+                    del_len[0] = 0;
+                    ins_len[1] = 0;
+                    del_len[1] = 0;
+                    c_start = 0; // re-assign c_start
+                }
+            }
+        }
+        c_start = max(c_start, snp.pos + (int)snp.ref.length());
+
+        // assign snp to cluster (the hard-coded debug print for one genome
+        // position that used to sit here has been removed)
+        cluster_vars_map[cluster_index].push_back(snp);
+        not_first = true;
+
+        const int flag = snp.flag;
+        ins_len[flag] += snp.mil;
+        del_len[flag] += snp.mdl;
+    }
+}
+
+
+void DiploidVCF::ReverseLinearClusteringVariants() {
+    // Same clustering rule as the forward pass, but the two sorted lists are
+    // merged from the back: the variant whose REF allele ends furthest right
+    // is consumed first (the query variant wins ties). The gap examined runs
+    // from the end of the variant just taken up to the smallest start
+    // position already placed in the current cluster.
+    const int kUnset = std::numeric_limits<int>::max();
+    int cluster_id = 0;
+    int inserted[2] = { 0 };
+    int deleted[2] = { 0 };
+    int gap_begin = kUnset;
+    int gap_end = kUnset;
+
+    sort(ref_variant_list.begin(), ref_variant_list.end());
+    sort(que_variant_list.begin(), que_variant_list.end());
+
+    const int ref_total = ref_variant_list.size();
+    const int que_total = que_variant_list.size();
+    int r = ref_total - 1;
+    int q = que_total - 1;
+
+    bool seen_any = false;
+    DiploidVariant current;
+    while (r >= 0 || q >= 0) {
+        // Decide which list supplies the next variant.
+        bool from_ref;
+        if (r >= 0 && q >= 0) {
+            from_ref = ref_variant_list[r].pos + ref_variant_list[r].ref.size() > que_variant_list[q].pos+que_variant_list[q].ref.size();
+        } else {
+            from_ref = (r >= 0);
+        }
+
+        if (from_ref) {
+            current = ref_variant_list[r];
+            r--;
+        } else {
+            current = que_variant_list[q];
+            q--;
+        }
+
+        // Decide whether the gap to the current cluster is wide enough to split.
+        if (seen_any) {
+            gap_begin = current.pos + current.ref.size();
+            if (gap_end - gap_begin >= 2) {
+                const int gap_length = gap_end - gap_begin;
+                string separator = genome_sequence.substr(gap_begin, gap_length);
+                const int max_change = max(inserted[0] + deleted[1], inserted[1] + deleted[0]);
+                bool split = (max_change == 0);
+                if (!split && gap_length > 2 * max_change) {
+                    split = gap_length > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, max_change);
+                }
+
+                if (split) {
+                    ++cluster_id;
+                    for (int side = 0; side < 2; side++) {
+                        inserted[side] = 0;
+                        deleted[side] = 0;
+                    }
+                    gap_end = kUnset; // new cluster: forget the previous left edge
+                }
+            }
+        }
+        gap_end = min(gap_end, current.pos);
+
+        // Place the variant and account for its indel lengths.
+        cluster_vars_map[cluster_id].push_back(current);
+        seen_any = true;
+
+        const int side = current.flag;
+        inserted[side] += current.mil;
+        deleted[side] += current.mdl;
+    }
+}
+
+
+void DiploidVCF::DivisiveHierarchicalClustering(list<vector<DiploidVariant>> & snp_clusters){
+    // Repeatedly try to split every cluster at its largest internal gap until
+    // no cluster can be split any further.
+    //
+    // snp_clusters is processed as a queue: each pass pops every cluster from
+    // the front and pushes either the unchanged cluster or its two halves to
+    // the back. A list is used (rather than a vector) for its pop_front /
+    // push_back members.
+    //
+    // NOTE(review): the gap search assumes each cluster is sorted by position.
+    if(snp_clusters.empty()) return;
+
+    // potential_list[i] records whether cluster i may still be separable.
+    // At the very beginning all clusters can be separated; newly created
+    // sub-clusters can be separated; once a cluster is marked non-separable
+    // it is never examined again.
+    list<bool> potential_list(snp_clusters.size(), true);
+
+    // Total number of variants across ALL clusters, for the sanity check.
+    size_t previous_variant_num = 0;
+    for(auto it = snp_clusters.begin(); it != snp_clusters.end(); ++it){
+        previous_variant_num += it->size();
+    }
+
+    bool flag = true; // true while the last pass split at least one cluster
+    while(flag){
+        flag = false;
+        const int list_size = (int)snp_clusters.size();
+        for(int i = 0; i < list_size; i++){
+            vector<DiploidVariant> front_cluster = snp_clusters.front();
+            const bool front_potential = potential_list.front();
+            snp_clusters.pop_front();
+            potential_list.pop_front();
+
+            // Single-variant clusters and clusters already marked as
+            // non-separable are passed through unchanged.
+            if(front_cluster.size() == 1 || !front_potential){
+                snp_clusters.push_back(front_cluster);
+                potential_list.push_back(false);
+                continue;
+            }
+
+            // find the largest gap, see if we can separate from that gap
+            int max_start = -1;
+            int max_end = -1;
+            int max_length = -1;
+            // covered_end is the furthest REF end among the variants visited
+            // so far; it must advance with every variant, otherwise every gap
+            // is measured from the first variant only.
+            int covered_end = front_cluster[0].pos + (int)front_cluster[0].ref.length();
+            for(size_t k = 1; k < front_cluster.size(); k++){
+                const DiploidVariant & snp = front_cluster[k];
+                const int gap = snp.pos - covered_end;
+                if(max_length < gap){
+                    max_length = gap;
+                    max_start = covered_end;
+                    max_end = snp.pos;
+                }
+                covered_end = max(covered_end, snp.pos + (int)snp.ref.length());
+            }
+
+            if(max_length <= 0){
+                snp_clusters.push_back(front_cluster);
+                potential_list.push_back(false);
+                continue;
+            }
+
+            // Partition the variants around the gap and accumulate, per side
+            // and per source (flag), the maximal insertion / deletion length.
+            int left_ins[2] = {0};
+            int left_del[2] = {0};
+            int right_ins[2] = {0};
+            int right_del[2] = {0};
+            vector<DiploidVariant> left_snp_list;
+            vector<DiploidVariant> right_snp_list;
+            string separator = genome_sequence.substr(max_start, max_end-max_start);
+            for(size_t k = 0; k < front_cluster.size(); k++){
+                const DiploidVariant & snp = front_cluster[k];
+                const int rq = snp.flag;
+                const int ref_length = (int)snp.ref.length();
+                // The second ALT must update these same variables;
+                // re-declaring them inside the branch would shadow them.
+                int snp_ins = max(0, (int)snp.alts[0].length() - ref_length);
+                int snp_del = max(0, ref_length - (int)snp.alts[0].length());
+                if(snp.multi_alts){
+                    snp_ins = max(snp_ins, (int)snp.alts[1].length() - ref_length);
+                    snp_del = max(snp_del, ref_length - (int)snp.alts[1].length());
+                }
+                if(snp.pos <= max_start){
+                    left_ins[rq] += snp_ins;
+                    left_del[rq] += snp_del;
+                    left_snp_list.push_back(snp);
+                }else{
+                    right_ins[rq] += snp_ins;
+                    right_del[rq] += snp_del;
+                    right_snp_list.push_back(snp);
+                }
+            }
+            //check
+            if(left_snp_list.size() == 0 || right_snp_list.size() == 0){
+                snp_clusters.push_back(front_cluster);
+                potential_list.push_back(false);
+                continue;
+            }
+
+            const int mcll = max(left_del[0]+left_ins[1], left_del[1]+left_ins[0]);
+            const int mclr = max(right_del[0]+right_ins[1], right_del[1]+right_ins[0]);
+            const int min_mcl = min(mcll, mclr);
+
+            // Split only when the gap is longer than twice the smaller side's
+            // change and is either too long to be a repeat or not a tandem
+            // repeat according to CheckTandemRepeat.
+            if ((int)separator.length() > 2 * min_mcl &&
+                ((int)separator.length() > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, min_mcl)))
+            {
+                flag = true;
+                snp_clusters.push_back(left_snp_list);
+                potential_list.push_back(true);
+                snp_clusters.push_back(right_snp_list);
+                potential_list.push_back(true);
+            }else{
+                snp_clusters.push_back(front_cluster);
+                potential_list.push_back(false);
+            }
+        }
+
+        // Splitting must never lose or duplicate a variant.
+        size_t current_variant_num = 0;
+        for(auto it = snp_clusters.begin(); it != snp_clusters.end(); ++it){
+            current_variant_num += it->size();
+        }
+        if(current_variant_num != previous_variant_num){
+            dout << "[VarMatch] Error during clustering" << endl;
+        }
+    }
+    return;
+}
+
+// private
+// code reviewed
+bool DiploidVCF::ClusteringMatchInThread(int start, int end, int thread_index) {
+    // Run the accelerated matcher on every existing cluster whose id lies in
+    // [start, end). Cluster ids that are absent from cluster_vars_map are
+    // skipped. Always returns true.
+    for (int cluster_id = start; cluster_id < end; ++cluster_id) {
+        auto found = cluster_vars_map.find(cluster_id);
+        if (found == cluster_vars_map.end()) {
+            continue;
+        }
+        auto & cluster_variants = found->second;
+        // Report unusually large clusters on stdout.
+        if (cluster_variants.size() > 100) {
+            cout << cluster_id << ":" ;
+            cout << cluster_variants.size() << endl;
+        }
+        AcceleratedVariantMatchPathCreation(cluster_variants, thread_index, cluster_id);
+    }
+    return true;
+}
+
+// private
+void DiploidVCF::ClusteringMatchMultiThread() {
+    // Distribute the clusters in cluster_vars_map over thread_num workers
+    // (thread_num - 1 spawned threads plus the calling thread), then write
+    // all per-thread match records to output_complex_filename and sum the
+    // per-thread match counters into total_ref_complex / total_que_complex.
+    clustering_search = true;
+
+    // begin() must not be dereferenced on an empty map.
+    const bool has_clusters = !cluster_vars_map.empty();
+    int start = has_clusters ? cluster_vars_map.begin()->first : 0; // start cluster id
+    const int cluster_number = cluster_vars_map.size();             // cluster number
+    const int cluster_end_boundary = start + cluster_number;        // end cluster id, exclusive
+    int cluster_step = cluster_number / thread_num;                 // clusters per worker, rounded up
+    if (cluster_step * thread_num < cluster_number) cluster_step++;
+    int end = start + cluster_step;
+
+    //initialize vector size
+    complex_match_records = new vector<string>*[thread_num];
+    for (int j = 0; j < thread_num; j++) {
+        complex_match_records[j] = new vector<string>;
+        complex_ref_match_num.push_back(0);
+        complex_que_match_num.push_back(0);
+    }
+
+    vector<thread> threads;
+    if (has_clusters) {
+        //spawn threads
+        unsigned i = 0;
+        for (; i < thread_num - 1; i++) {
+            threads.push_back(thread(&DiploidVCF::ClusteringMatchInThread, this, start, end, i));
+            start = end;
+            end = start + cluster_step;
+        }
+        // The calling thread processes the last slice itself;
+        // i equals (thread_num - 1) here.
+        if (i != thread_num - 1) {
+            dout << "[Error] thread number not match" << endl;
+        }
+        // Compare against the exclusive end id rather than the cluster count,
+        // so the check stays valid if the first cluster id is not 0.
+        if (start >= cluster_end_boundary) {
+            dout << "[Error] index out of map range" << endl;
+        }
+        else {
+            ClusteringMatchInThread(start, end, i);
+        }
+    }
+
+    // Wait for all spawned workers before reading their results.
+    std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
+
+    ofstream output_complex_file;
+    output_complex_file.open(output_complex_filename);
+    output_complex_file << "##VCF1:" << ref_vcf_filename << endl;
+    output_complex_file << "##VCF2:" << que_vcf_filename << endl;
+    output_complex_file << "#CHROM\tPOS\tREF\tALT\tVCF1\tVCF2\tPHASE1\tPHASE2\tSCORE" << endl;
+    for (int i = 0; i < thread_num; i++) {
+        for (int j = 0; j < complex_match_records[i]->size(); j++) {
+            // Skip records that are empty or consist only of spaces.
+            if (complex_match_records[i]->at(j).find_first_not_of(' ') != std::string::npos) {
+                output_complex_file << chromosome_name << "\t" << complex_match_records[i]->at(j);
+            }
+        }
+    }
+    output_complex_file.close();
+
+    for (int j = 0; j < thread_num; j++) {
+        delete complex_match_records[j];
+    }
+    delete[] complex_match_records;
+
+    total_ref_complex = 0;
+    total_que_complex = 0;
+    for (int i = 0; i < complex_ref_match_num.size(); i++)
+        total_ref_complex += complex_ref_match_num[i];
+    for (int i = 0; i < complex_que_match_num.size(); i++)
+        total_que_complex += complex_que_match_num[i];
+
+    cout << "complex match: " << total_ref_complex << "," << total_que_complex << endl;
+}
+
+// for public access
+void DiploidVCF::Compare(string ref_vcf,
+        string query_vcf,
+        string genome_seq,
+        bool direct_search,
+        string output_prefix,
+        bool match_genotype,
+        bool normalization,
+        bool score_basepair,
+        bool overlap_match,
+        bool variant_check) {
+    // Public entry point: store the run options, load the genome and both
+    // VCF files, cluster the variants, match the clusters and write the
+    // summary counts to "<output_prefix>.stat". Match records go to
+    // "<output_prefix>.match". direct_search is accepted but not read here.
+
+    // --- run configuration
+    this->match_genotype = match_genotype;
+    this->normalization = normalization;
+    this->scoring_basepair = score_basepair;
+    this->overlap_match = overlap_match;
+    this->variant_check = variant_check;
+    ref_vcf_filename = ref_vcf;
+    que_vcf_filename = query_vcf;
+    output_stat_filename = output_prefix + ".stat";
+    output_complex_filename = output_prefix + ".match";
+
+    // --- genome sequence
+    dsptime();
+    dout << " Read genome sequence file... " << endl;
+    ReadGenome(genome_seq);
+    dsptime();
+    dout << " Finish reading genome sequence file." << endl;
+
+    // --- reference and query VCF
+    dsptime();
+    dout << " Read reference vcf file... " << endl;
+    const int ref_total_num = ReadRefVCF(ref_vcf);
+    dsptime();
+    dout << " Read query vcf file... " << endl;
+    const int que_total_num = ReadQueryVCF(query_vcf);
+    dsptime();
+    dout << " Finish reading all vcf file." << endl;
+    dout << " total variants: " << ref_total_num << "," << que_total_num << endl;
+
+    // --- clustering
+    dsptime();
+    dout << " Clustering snps ... " << endl;
+    LinearClusteringVariants();
+    dsptime();
+    dout << " Finish clustering." << endl;
+
+    // --- matching
+    dsptime();
+    dout << " Clustering search ... " << endl;
+    ClusteringMatchMultiThread();
+    dsptime();
+    dout << " Finish clustering search." << endl;
+    dout << " total match: " << total_ref_complex << "," << total_que_complex << endl;
+    const int ref_mismatch_num = ref_total_num - total_ref_complex;
+    const int que_mismatch_num = que_total_num - total_que_complex;
+    dout << " mismatch: " << ref_mismatch_num << "," << que_mismatch_num << endl;
+
+    // --- statistics file: six counts, one per line
+    ofstream output_stat_file(output_stat_filename);
+    output_stat_file << ref_total_num << endl;
+    output_stat_file << que_total_num << endl;
+    output_stat_file << total_ref_complex << endl;
+    output_stat_file << total_que_complex << endl;
+    output_stat_file << ref_mismatch_num << endl;
+    output_stat_file << que_mismatch_num << endl;
+    output_stat_file.close();
+    return;
+}
+
diff --git a/src/diploid.h b/src/diploid.h
new file mode 100644
index 0000000..2010082
--- /dev/null
+++ b/src/diploid.h
@@ -0,0 +1,342 @@
+#pragma once
+
+#include "vcf.h"
+
+// data structure for direct search
+class DiploidVariant {
+public:
+    // All arguments are optional; the defaults describe an "empty" variant at
+    // position -1 with two empty ALT alleles.
+    DiploidVariant(int pos_ = -1,
+        string ref_ = "",
+        vector<string> alts_ = {"",""},
+        bool heterozygous_ = false,
+        bool multi_alts_ = false,
+        int mdl_ = 0,
+        int mil_ = 0,
+        int flag_ = 0) :
+        pos(pos_),
+        ref(ref_),
+        alts(alts_),
+        heterozygous(heterozygous_),
+        multi_alts(multi_alts_),
+        mdl(mdl_),
+        mil(mil_),
+        flag(flag_){}
+
+    int pos;              // variant position
+    string ref;           // REF allele
+    vector<string> alts;  // ALT alleles; comparisons below read alts[0] and alts[1]
+    bool heterozygous;
+    bool multi_alts;      // true when alts[1] is a second, distinct ALT allele
+    int mdl;              // added to the per-source deletion length during clustering
+    int mil;              // added to the per-source insertion length during clustering
+    int flag; //in DiploidVariant, flag = 0 is reference, flag = 1 is query
+
+    // Order by position only.
+    bool operator <(const DiploidVariant& y) const {
+        return pos < y.pos;
+    }
+
+    // Genotype-aware equality.
+    // this is based on the assumption that all sequence are in upper case
+    // Position, REF, heterozygosity and multi-ALT status must all agree. For
+    // heterozygous multi-ALT variants at least two (i, j) ALT pairs must be
+    // equal; otherwise only the first ALT is compared.
+    bool operator ==(const DiploidVariant& y) const {
+        if (pos == y.pos && ref == y.ref) {
+            if(heterozygous == y.heterozygous && multi_alts == y.multi_alts){
+                if (multi_alts && heterozygous) {
+                    int match_times = 0;
+                    for (int i = 0; i < 2; i++) {
+                        for (int j = 0; j < 2; j++) {
+                            if (alts[i] == y.alts[j])
+                                match_times++;
+                        }
+                    }
+                    if (match_times >= 2)
+                        return true;
+                }
+                else if(alts[0] == y.alts[0]){
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    // Looser comparison: same position and REF, and either both variants are
+    // heterozygous multi-ALT with at least one ALT in common, or their first
+    // ALT alleles are equal.
+    bool DirectCompare(const DiploidVariant& y) const {
+        if (pos == y.pos && ref == y.ref) {
+            if (multi_alts && heterozygous && y.multi_alts && y.heterozygous) {
+                int match_times = 0;
+                for (int i = 0; i < 2; i++) {
+                    for (int j = 0; j < 2; j++) {
+                        if (alts[i] == y.alts[j])
+                            match_times++;
+                    }
+                }
+                if (match_times > 0)
+                    return true;
+            }
+            else if(alts[0] == y.alts[0]){
+                return true;
+            }
+        }
+        return false;
+    }
+
+    // Genotype-agnostic comparison: same position and REF, and any ALT of
+    // this variant equals any ALT of y. A second ALT is only considered on
+    // the side whose multi_alts flag is set.
+    bool CompareNoGenotype(const DiploidVariant & y) const {
+        if(pos == y.pos && ref == y.ref){
+            if(alts[0] == y.alts[0]) return true;
+            if(multi_alts){
+                if(alts[1] == y.alts[0]) return true;
+                if(y.multi_alts && alts[1] == y.alts[1]){
+                    return true;
+                }
+            }
+            if(y.multi_alts && alts[0] == y.alts[1]){
+                return true;
+            }
+        }
+        return false;
+    }
+
+};
+
+// define outside of struct, idiomatic solution for lexicographical compare for structures
+//bool operator <(const DiploidVariant& x, const DiploidVariant& y);
+
+//bool operator ==(const DiploidVariant& x, const DiploidVariant& y);
+
+class VariantSelection{
+public:
+    int score;
+    int separate_score[2];
+    int min_genome_pos; // min(donor_sequence[0], donor_sequence[2])
+    bool haplotypes_consistent;
+    int genome_position[2]; // genome position that has been considered, exclusive
+    int donor_length[2];
+    string donor_sequences[4];
+    vector<int> pos_vectors[2]; // selected variants, not necessary now
+    vector<int> phasing_vectors[2]; // phasing vector for corresponding variant, for D_0
+    int cur_var;
+    bool overlap_detected;
+
+    // Start from an empty selection: zero scores and lengths, no variants
+    // chosen, empty donor sequences, cur_var and min_genome_pos at -1.
+    VariantSelection() :
+        score(0),
+        min_genome_pos(-1),
+        haplotypes_consistent(false),
+        cur_var(-1),
+        overlap_detected(false)
+    {
+        for(int side = 0; side < 2; side++){
+            separate_score[side] = 0;
+            genome_position[side] = 0;
+            donor_length[side] = 0;
+            pos_vectors[side].clear();
+            phasing_vectors[side].clear();
+        }
+        for(int d = 0; d < 4; d++){
+            donor_sequences[d].clear();
+        }
+    }
+
+    // Selections are ordered by the smallest genome position they have reached.
+    bool operator< (const VariantSelection& rhs) const
+    {
+        return min_genome_pos < rhs.min_genome_pos;
+    }
+};
+
+// Diploid (genotype-aware) VCF comparison, built on top of the VCF base class.
+// The public entry point is Compare(); everything else is internal machinery
+// for reading, clustering and matching variants.
+class DiploidVCF : public VCF
+{
+private:
+
+ // One position -> variant container per slot (presumably one slot per thread/partition - confirm in diploid.cpp).
+ typedef vector<unordered_map<int, DiploidVariant > > VariantHash;
+ typedef vector<map<int, DiploidVariant > > VariantMap;
+
+ VariantHash refpos_2_var;
+ VariantHash querypos_2_var;
+
+ // variant_list holds reference and query variants together (sorted by ClusteringVariants / SortVariantList);
+ // ref_variant_list and que_variant_list are the separate lists consumed by the linear clustering functions.
+ vector<DiploidVariant> variant_list;
+ vector<DiploidVariant> ref_variant_list;
+ vector<DiploidVariant> que_variant_list;
+
+ // Readers; Compare() uses the returned value as the variant count of the file.
+ int ReadRefVCF(string filename);
+ int ReadQueryVCF(string filename);
+
+ void DirectSearchInThread(unordered_map<int, DiploidVariant> & ref_snps,
+ unordered_map<int, DiploidVariant> & query_snps,
+ int thread_index);
+ void DirectSearchMultiThread();
+ // Sorts variant_list and fills cluster_vars_map.
+ void ClusteringVariants();
+ // Matches clusters with ids in [start, end) on behalf of worker thread_index.
+ bool ClusteringMatchInThread(int, int, int);
+ // Splits the clusters over the worker threads, then writes the match file and sums the counters.
+ void ClusteringMatchMultiThread();
+
+ ofstream offf;
+ const time_t ctt = time(0);
+
+protected:
+
+ // Per-thread match counters, summed into the two totals below by ClusteringMatchMultiThread.
+ vector<int> complex_ref_match_num;
+ vector<int> complex_que_match_num;
+ int total_ref_complex;
+ int total_que_complex;
+
+ // Options copied from the Compare() arguments.
+ bool scoring_basepair;
+ bool overlap_match;
+ bool variant_check;
+ // cluster id -> variants assigned to that cluster.
+ map<int, vector<DiploidVariant> > cluster_vars_map;
+
+ void DecideBoundaries();
+
+ int ReadDiploidVCF(string filename, vector<DiploidVariant> & x_variant_list, int flag);
+ bool NormalizeDiploidVariant(DiploidVariant & var);
+
+ bool VariantMatch(vector<DiploidVariant> & variant_list, int thread_index);
+
+ bool FindBestMatch(vector<DiploidVariant> & variant_list,
+ const string subsequence,
+ const int offset,
+ int index,
+ map<int, DiploidVariant> separate_pos_var[],
+ vector<vector<int>> max_choices[], // 4 vectors
+ int & max_score,
+ bool & max_heterozygosity,
+ string max_paths[]); //only two
+
+ // Exhaustive diploid search: pairs reference and query selections that produce identical donor sequence pairs.
+ bool FindBestDiploidMatch(vector<DiploidVariant> & variant_list,
+ const string subsequence,
+ const int offset,
+ int index,
+ map<int, DiploidVariant> separate_pos_var[],
+ vector<vector<int>> max_choices[], // 4 vectors
+ int & max_score,
+ bool & max_heterozygosity,
+ string max_paths[]); //only two
+
+ int CheckPrefix(const string subsequence,
+ const int offset,
+ map<int, DiploidVariant> separate_pos_var[],
+ map<int, int> choices[],
+ string cur_paths[]);
+
+ bool RecurrentVariantMatch(vector<DiploidVariant> & variant_list, int thread_index);
+ void RecurrentMatchWithIndel(vector<DiploidVariant> & variant_list,
+ const string subsequence,
+ const int offset,
+ int index,
+ map<int, DiploidVariant> separate_pos_var[],
+ map<int, int> choices[], // 4 vectors
+ map<int, int> max_matches[], // 4 vectors
+ int & max_score,
+ string max_paths[]);
+
+ vector<vector<vector<int>>> Combine(vector<int> & positions,
+ vector<bool> & multi_indicators,
+ int k);
+
+ void FindComb(vector<int> & positions,
+ vector<bool> & multi_indicators,
+ int start,
+ int k,
+ vector<vector<int> > & sol,
+ vector<vector<vector<int>>> & all_sol);
+
+ // Returns every selection of k variants as {position, allele index} pairs; empty when k is 0 or too large.
+ vector<vector<vector<int>>> DiploidCombine(vector<int> & positions,
+ vector<bool> & heter_indicators,
+ vector<bool> & multi_indicators,
+ int k);
+
+ // Recursive worker behind DiploidCombine; appends finished selections to all_sol.
+ void FindDiploidComb(vector<int> & positions,
+ vector<bool> & heter_indicators,
+ vector<bool> & multi_indicators,
+ int start,
+ int k,
+ vector<vector<int> > & sol,
+ vector<vector<vector<int>>> & all_sol);
+
+ // Applies the chosen alleles to ref and returns the donor sequence and its score (0 on failure).
+ void ModifyRefMultiVar(const string & ref,
+ int offset,
+ map<int, DiploidVariant> & pos_var,
+ vector<vector<int>> pos_choice,
+ string & donor,
+ int & score);
+
+ // Repeatedly tries to split each cluster in the list at an internal gap.
+ void DivisiveHierarchicalClustering(list<vector<DiploidVariant> > & snp_clusters);
+
+ bool VariantMatchWithOverlap(vector<DiploidVariant> & variant_list, int thread_index);
+
+ bool FindBestMatchWithOverlap(vector<DiploidVariant> & variant_list,
+ const string subsequence,
+ const int offset,
+ int index,
+ map<int, DiploidVariant> separate_pos_var[],
+ map<int, int> selected_positions[]);
+
+ bool VariantMatchPathCreation(vector<DiploidVariant> & variant_list, int thread_index, int cluster_id);
+ bool CollapseSelections(VariantSelection selection,
+ list<VariantSelection> & variant_selections);
+
+ int CheckDonorSequences(vector<DiploidVariant> separate_var_list[],
+ VariantSelection & selection,
+ const string & subsequence,
+ int offset,
+ string donor_sequences[]);
+
+ bool AddVariantToSelection(list<VariantSelection> & variant_selections,
+ VariantSelection selection,
+ DiploidVariant variant,
+ int haplotype,
+ vector<DiploidVariant> separate_var_list[],
+ const string & subsequence,
+ int offset,
+ VariantSelection & best_selection);
+
+ void SortVariantList();
+ void ReadGenome(string filename);
+ // Forward merge of ref_variant_list / que_variant_list into clusters (the variant used by Compare).
+ void LinearClusteringVariants();
+
+ int NormalizeVariantSequence(int pos,
+ string & parsimonious_ref,
+ string & parsimonious_alt0,
+ string & parsimonious_alt1);
+
+ int ExtendingDonorSequences(vector<DiploidVariant> separate_var_list[],
+ VariantSelection & selection,
+ const string & subsequence,
+ int offset,
+ int flag);
+
+ bool CollapsePrefixMatchSelection(VariantSelection selection,
+ list<VariantSelection> & variant_selections);
+
+ // Same clustering rule as LinearClusteringVariants, walking both lists from the back.
+ void ReverseLinearClusteringVariants();
+ // Matcher invoked per cluster by ClusteringMatchInThread.
+ bool AcceleratedVariantMatchPathCreation(vector<DiploidVariant> & variant_list, int thread_index, int cluster_id);
+ bool VariantMatchPathCreationByDonor(vector<DiploidVariant> & variant_list, int thread_index, int cluster_id);
+
+ int CheckDonorSequencesWithOverlap(vector<DiploidVariant> separate_var_list[],
+ VariantSelection & selection,
+ const string & subsequence,
+ int offset,
+ string donor_sequences[]);
+
+ void PrintVariant(DiploidVariant var);
+
+public:
+ DiploidVCF(int thread_num_);
+ ~DiploidVCF();
+
+ const static int VAR_LEN = 100;
+
+ // Debug hook; currently returns 0 without doing any work.
+ int test();
+ // for public access
+ // Runs the full comparison and writes "<output_prefix>.stat" and "<output_prefix>.match".
+ void Compare(string ref_vcf,
+ string query_vcf,
+ string genome_seq,
+ bool direct_search,
+ string output_prefix,
+ bool match_genotype,
+ bool normalization,
+ bool scoring_basepair,
+ bool overlap_match,
+ bool variant_check);
+
+};
diff --git a/src/diploidvariant.h b/src/diploidvariant.h
new file mode 100644
index 0000000..cd3acdd
--- /dev/null
+++ b/src/diploidvariant.h
@@ -0,0 +1,117 @@
+// Data structure for direct search: one variant record with up to two
+// alternative alleles and a few precomputed attributes.
+//
+// The comparison helpers assume that all sequences are already upper case.
+// They always read alts[0], and read alts[1] whenever multi_alts is set, so
+// `alts` must hold at least that many entries (the default argument supplies
+// two empty strings).
+class DiploidVariant {
+public:
+    // The initializers are written in member declaration order, which is the
+    // order in which C++ runs them regardless of how the list is written.
+    DiploidVariant(int pos_ = -1,
+        string ref_ = "",
+        vector<string> alts_ = {"",""},
+        bool heterozygous_ = false,
+        bool multi_alts_ = false,
+        int mdl_ = 0,
+        int mil_ = 0,
+        bool flag_ = false,
+        double qual_ = 0.0,
+        bool zero_one_var_ = false) :
+        pos(pos_),
+        ref(ref_),
+        alts(alts_),
+        heterozygous(heterozygous_),
+        multi_alts(multi_alts_),
+        zero_one_var(zero_one_var_),
+        mdl(mdl_),
+        mil(mil_),
+        flag(flag_),
+        qual(qual_){}
+
+    int pos;
+    string ref;
+    vector<string> alts;
+    bool heterozygous;
+    bool multi_alts;
+    bool zero_one_var; // the phasing should be 0/1 or 1/0, no matter if it contains multi_alts,
+                       // i.e. multi_alts does not mean that it is 1/2 or 2/1
+    int mdl; // presumably the maximum deletion length - TODO confirm against the code that fills it
+    int mil; // presumably the maximum insertion length - TODO confirm against the code that fills it
+    bool flag; // flag = false is reference, flag = true is query
+    double qual;
+
+    // Orders variants by position only.
+    bool operator <(const DiploidVariant& y) const {
+        return pos < y.pos;
+    }
+
+    // Genotype-aware equality. Position and reference must be equal, then:
+    //  * with identical heterozygous/multi_alts flags, a heterozygous
+    //    multi-allelic pair needs at least two matching allele pairs and any
+    //    other pair needs identical first alternatives;
+    //  * independently of that, two multi-allelic 0/1-style variants are equal
+    //    when at least two allele pairs match.
+    bool operator ==(const DiploidVariant& y) const {
+        if (pos != y.pos || ref != y.ref) return false;
+
+        if (heterozygous == y.heterozygous && multi_alts == y.multi_alts) {
+            if (multi_alts && heterozygous) {
+                if (CountAltMatches(y) >= 2) return true;
+            }
+            else if (alts[0] == y.alts[0]) {
+                return true;
+            }
+        }
+        if (multi_alts && zero_one_var && y.multi_alts && y.zero_one_var) {
+            if (CountAltMatches(y) > 1) return true;
+        }
+        return false;
+    }
+
+    // Looser comparison: heterozygous multi-allelic variants match as soon as
+    // one allele pair is shared, everything else needs identical first
+    // alternatives.
+    bool DirectCompare(const DiploidVariant& y) const {
+        if (pos != y.pos || ref != y.ref) return false;
+
+        if (multi_alts && heterozygous && y.multi_alts && y.heterozygous) {
+            return CountAltMatches(y) > 0;
+        }
+        return alts[0] == y.alts[0];
+    }
+
+    // Genotype-blind comparison: true when any alternative of this variant
+    // equals any alternative of y (second alternatives are only consulted for
+    // the side that has multi_alts set).
+    bool CompareNoGenotype(const DiploidVariant & y) const {
+        if (pos != y.pos || ref != y.ref) return false;
+
+        if (alts[0] == y.alts[0]) return true;
+        if (multi_alts) {
+            if (alts[1] == y.alts[0]) return true;
+            if (y.multi_alts && alts[1] == y.alts[1]) return true;
+        }
+        return y.multi_alts && alts[0] == y.alts[1];
+    }
+
+private:
+    // Number of (i, j) pairs, i and j in {0, 1}, with alts[i] == y.alts[j].
+    int CountAltMatches(const DiploidVariant& y) const {
+        int match_times = 0;
+        for (int i = 0; i < 2; i++) {
+            for (int j = 0; j < 2; j++) {
+                if (alts[i] == y.alts[j])
+                    match_times++;
+            }
+        }
+        return match_times;
+    }
+
+};
diff --git a/src/filter_cv.cpp b/src/filter_cv.cpp
new file mode 100644
index 0000000..87eb773
--- /dev/null
+++ b/src/filter_cv.cpp
@@ -0,0 +1,245 @@
+#include <map>
+#include <unordered_map>
+#include <vector>
+#include <iostream>
+#include <algorithm>
+#include <tclap/CmdLine.h>
+#include "util.h"
+
+using namespace std;
+
+// All intervals use 0-based coordinates.
+
+// A pair of integer coordinates; the default interval is (0, 0).
+struct Interval {
+    int start;
+    int end;
+    Interval(int s, int e) : start(s), end(e) {}
+    Interval() : Interval(0, 0) {}
+};
+
+// Comparator that orders intervals by ascending start coordinate.
+struct compInterval {
+    bool operator()(const Interval &lhs, const Interval &rhs) const {
+        return rhs.start > lhs.start;
+    }
+};
+
+// Command-line options filled in by TclapParser.
+struct Args {
+    string baseline_filename;     // value of -b / --baseline
+    vector<string> vcf_filenames; // values of every -v / --vcf_files
+};
+
+// Parses the command line into `args` with TCLAP.
+// -b/--baseline takes one value and -v/--vcf_files may be repeated; both are
+// declared as required. On a TCLAP::ArgException the error is printed to
+// stderr and the process is terminated with abort(); otherwise returns true.
+bool TclapParser(Args & args, int argc, char** argv){
+ string version = "0.9";
+
+ try {
+ std::string desc = "Please cite our paper if you are using this program in your research. \n";
+ TCLAP::CmdLine cmd(desc, ' ', version);
+
+ TCLAP::ValueArg<std::string> arg_baseline_filename("b", "baseline", "VCF file", true, "", "file");
+ TCLAP::MultiArg<std::string> arg_vcf_filenames("v", "vcf_files", "VCF file list", true, "file list");
+
+ // Arguments must be registered with the command line before parse().
+ cmd.add(arg_vcf_filenames);
+ cmd.add(arg_baseline_filename);
+
+ cmd.parse(argc, argv);
+
+ // Copy the parsed values out before the TCLAP objects go out of scope.
+ args.baseline_filename = arg_baseline_filename.getValue();
+ args.vcf_filenames = arg_vcf_filenames.getValue();
+ }
+ catch (TCLAP::ArgException &e)
+ {
+ std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
+ abort();
+ }
+ return true;
+}
+
+// Sorts `intervals` in place by start and returns the merged list.
+// An interval is folded into the previous result when its start is not
+// greater than the previous end, so touching intervals are merged too.
+vector<Interval> merge(vector<Interval> &intervals) {
+    sort(intervals.begin(), intervals.end(), compInterval());
+
+    vector<Interval> merged;
+    for (const Interval &current : intervals) {
+        const bool joins_previous = !merged.empty() && merged.back().end >= current.start;
+        if (joins_previous) {
+            merged.back().end = max(merged.back().end, current.end);
+        } else {
+            merged.push_back(current);
+        }
+    }
+    return merged;
+}
+
+// Reads every record of the VCF file `filename`.
+//
+// Each chromosome name gets an index in `chrname_index` the first time it is
+// seen, together with a new (empty) entry in `interval_list_list` and
+// `variant_hash_list`. For every record the whole text line is stored in
+// variant_hash_list[chr] under its 0-based position. When any alternative
+// allele has a length different from the reference, the window
+// (pos - 10, pos + 10) is appended to interval_list_list[chr].
+//
+// Header lines, lines of at most one character and lines with fewer than
+// five tab-separated columns are skipped (the columns are indexed up to
+// ALT, so shorter lines cannot be parsed).
+//
+// Returns the number of stored records, or -1 if the file cannot be opened.
+// NOTE(review): chromosome indices are counted from 0 on every call, so the
+// three output containers are expected to be empty on entry.
+int ReadWholeGenomeVariant(string filename,
+    vector<vector<Interval>> & interval_list_list,
+    vector<multimap<int, string>> & variant_hash_list,
+    map<string, int> & chrname_index)
+{
+    ifstream vcf_file(filename.c_str());
+    if (!vcf_file.good()) {
+        cout << "[VarMatch] Error: can not open vcf file" << endl;
+        return -1;
+    }
+
+    int total_num = 0;
+    int chr_num = 0;
+    string line;
+    while (getline(vcf_file, line)) {
+        // check ineligible lines
+        if ((int)line.length() <= 1) continue;
+        if (line[0] == '#') continue;
+
+        auto columns = split(line, '\t');
+        if (columns.size() < 5) continue; // CHROM, POS, ID, REF, ALT are all needed
+
+        const string & chr_name = columns[0];
+        if (chrname_index.find(chr_name) == chrname_index.end()) {
+            chrname_index[chr_name] = chr_num;
+            chr_num++;
+            interval_list_list.push_back(vector<Interval>());
+            variant_hash_list.push_back(multimap<int, string>());
+        }
+        const int chr_index = chrname_index[chr_name];
+
+        const int pos = atoi(columns[1].c_str()) - 1; // 0-based coordinate
+        const string & ref = columns[3];
+
+        // A record is an indel when some alternative differs in length from
+        // the reference. split() drops empty items, so iterate over what it
+        // returns instead of indexing fixed alternative slots.
+        bool is_indel = false;
+        for (const string & alt : split(columns[4], ',')) {
+            if (alt.length() != ref.length()) {
+                is_indel = true;
+                break;
+            }
+        }
+
+        if (is_indel) {
+            interval_list_list[chr_index].push_back(Interval(pos - 10, pos + 10));
+        }
+        variant_hash_list[chr_index].insert(make_pair(pos, line));
+        total_num++;
+    }
+    return total_num;
+}
+
+// Writes "<argv[1]>.cv.vcf" with the candidate variants of the VCF file
+// argv[1] that have another candidate within 10 positions.
+//
+// A candidate is a variant whose 0-based position lies inside one of the
+// merged indel windows produced by ReadWholeGenomeVariant() and merge().
+// Candidates are visited per chromosome in ascending position order and a
+// record is written when the previous or the next candidate on the same
+// chromosome is at most 10 positions away.
+//
+// Returns 0 on success and 1 when the input file name is missing or the
+// file cannot be read.
+// NOTE(review): several records at the same position share one slot in the
+// per-position table, so the last record read for that position is written
+// once per record - confirm whether this is intended.
+int FilterComplexVariant(int argc, char* argv[]){
+    if (argc < 2) {
+        cerr << "[VarMatch] Error: missing input vcf file" << endl;
+        return 1;
+    }
+
+    const string input_filename = argv[1];
+    const string output_filename = input_filename + ".cv.vcf";
+    vector<vector<Interval>> interval_list_list;
+    vector<multimap<int, string>> variant_hash_list;
+    map<string, int> chrname_index;
+
+    if (ReadWholeGenomeVariant(input_filename,
+            interval_list_list,
+            variant_hash_list,
+            chrname_index) < 0) {
+        return 1;
+    }
+
+    // Per chromosome: merged windows indexed as end -> start, so that
+    // lower_bound(pos) yields the first window ending at or after pos.
+    vector<map<int, int>> end_start_list;
+    for (auto & interval_list : interval_list_list) {
+        map<int, int> end_start;
+        for (const Interval & window : merge(interval_list)) {
+            end_start[window.end] = window.start;
+        }
+        end_start_list.push_back(end_start);
+    }
+
+    vector<unordered_map<int, string>> candidate_variant_hash_list;
+    vector<vector<int>> candidate_variant_pos_list;
+    cout << "filtering candidate variants..." << endl;
+    for (size_t k = 0; k < variant_hash_list.size(); k++) {
+        cout << "filtering candidate variants on chromosome " << k << endl;
+        const multimap<int, string> & variant_hash = variant_hash_list[k];
+        const map<int, int> & end_start = end_start_list[k];
+
+        unordered_map<int, string> candidate_variant_hash;
+        vector<int> candidate_variant_pos;
+
+        for (const auto & entry : variant_hash) {
+            const int varp = entry.first;
+            auto lowit = end_start.lower_bound(varp);
+            if (lowit == end_start.end()) continue;
+            const int interval_start = lowit->second;
+            const int interval_end = lowit->first;
+            if (varp >= interval_start && varp < interval_end) {
+                // candidate variant
+                candidate_variant_hash[varp] = entry.second;
+                candidate_variant_pos.push_back(varp);
+            }
+        }
+        candidate_variant_hash_list.push_back(candidate_variant_hash);
+        candidate_variant_pos_list.push_back(candidate_variant_pos);
+    }
+    cout << "filtered all candidate variants." << endl;
+
+    ofstream cv_file(output_filename);
+
+    cout << "filtering complex variants..." << endl;
+    for (size_t k = 0; k < variant_hash_list.size(); k++) {
+        const vector<int> & positions = candidate_variant_pos_list[k];
+        const unordered_map<int, string> & records = candidate_variant_hash_list[k];
+        cout << "filtering complex variants on chromosome " << k << endl;
+        for (size_t i = 0; i < positions.size(); i++) {
+            const int cur_pos = positions[i];
+            const bool near_previous = i > 0 && cur_pos - positions[i - 1] <= 10;
+            const bool near_next = i + 1 < positions.size() && positions[i + 1] - cur_pos <= 10;
+            if (near_previous || near_next) {
+                cv_file << records.at(cur_pos) << endl;
+            }
+        }
+    }
+    cout << "finished" << endl;
+    cv_file.close();
+    return 0;
+}
+
+
+
+// Parses the command line with TclapParser and returns 0.
+// The parsed arguments are not used for anything yet, and main() does not
+// call this function.
+int FiltetCandidateVariant(int argc, char* argv[]){
+ Args args;
+ TclapParser(args, argc, argv);
+
+ return 0;
+}
+
+// Program entry point: the exit status is whatever FilterComplexVariant returns.
+int main(int argc, char* argv[]){
+ return FilterComplexVariant(argc, argv);
+}
\ No newline at end of file
diff --git a/src/filter_hc.cpp b/src/filter_hc.cpp
new file mode 100644
index 0000000..73016d3
--- /dev/null
+++ b/src/filter_hc.cpp
@@ -0,0 +1,158 @@
+#include <tclap/CmdLine.h>
+#include <map>
+#include <unordered_map>
+#include <iostream>
+#include "util.h"
+
+using namespace std;
+
+// Command-line options filled in by TclapParser.
+struct Args {
+    string bed_filename;          // value of -b / --bedfile
+    vector<string> vcf_filenames; // values of every -v / --vcf_files
+    bool keep_outside;            // true when -o / --outside was given
+};
+
+// Parses the command line into `args` with TCLAP.
+// -b/--bedfile takes one value and -v/--vcf_files may be repeated; both are
+// declared as required. -o/--outside is a switch that defaults to false.
+// On a TCLAP::ArgException the error is printed to stderr and the process is
+// terminated with abort(); otherwise returns true.
+bool TclapParser(Args & args, int argc, char** argv){
+ string version = "0.9";
+
+ try {
+ std::string desc = "Please cite our paper if you are using this program in your research. \n";
+ TCLAP::CmdLine cmd(desc, ' ', version);
+
+ TCLAP::ValueArg<std::string> arg_bed_filename("b", "bedfile", "bedfile", true, "", "file");
+ TCLAP::MultiArg<std::string> arg_vcf_filenames("v", "vcf_files", "VCF file list", true, "file list");
+
+ string keep_variant_outside_string = "Keep variants outside, default keep variants inside. \n";
+ // The switch registers itself because `cmd` is passed to its constructor.
+ TCLAP::SwitchArg arg_keep_outside("o", "outside", keep_variant_outside_string, cmd, false);
+
+ cmd.add(arg_vcf_filenames);
+ cmd.add(arg_bed_filename);
+
+ cmd.parse(argc, argv);
+
+ // Copy the parsed values out before the TCLAP objects go out of scope.
+ args.bed_filename = arg_bed_filename.getValue();
+ args.vcf_filenames = arg_vcf_filenames.getValue();
+ args.keep_outside = arg_keep_outside.getValue();
+ }
+ catch (TCLAP::ArgException &e)
+ {
+ std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
+ abort();
+ }
+ return true;
+}
+
+// Loads the regions of a BED file.
+//
+// Every chromosome name gets an index in `chrname_2_index` the first time it
+// is seen, and chr_end_start[index] maps each region end to its start.
+// Lines starting with '#', blank lines and lines with fewer than three
+// tab-separated columns are skipped. The last line is read even when the
+// file does not end with a newline.
+//
+// Prints an error and returns without touching the outputs when the file
+// cannot be opened. std::stoi throws if a start or end column is not numeric.
+// NOTE(review): chromosome indices are counted from 0 on every call, so both
+// outputs are expected to be empty on entry.
+void ReadBedfile(string bed_filename,
+    map<string, int> & chrname_2_index,
+    vector<map<int, int>> & chr_end_start){
+
+    ifstream input(bed_filename);
+    if(!input.good()){
+        cout << "[Error] Read bed file error" << endl;
+        return;
+    }
+
+    int chr_num = 0;
+    string line;
+    // Testing the stream itself (not .good()) keeps a final line that has no
+    // trailing newline.
+    while (std::getline(input, line)) {
+        if (line.empty() || line[0] == '#') continue;
+        vector<string> columns = split(line, '\t');
+        if (columns.size() < 3) continue; // chrom, start and end are required
+
+        const string & chr_name = columns[0];
+        if (chrname_2_index.find(chr_name) == chrname_2_index.end()) {
+            chrname_2_index[chr_name] = chr_num;
+            chr_end_start.push_back(map<int, int>());
+            chr_num++;
+        }
+        const int chr_index = chrname_2_index[chr_name];
+        const int startp = stoi(columns[1]);
+        const int endp = stoi(columns[2]);
+        chr_end_start[chr_index][endp] = startp;
+    }
+    cout << "finish reading bed file" << endl;
+}
+
+// Filters `vcf_filename` against the BED regions and writes the result to
+// "<vcf_filename>.lcr.vcf".
+//
+// Header lines ('#') are always copied. Records on a chromosome that is not
+// in `chrname_2_index` are always copied as well. For every other record the
+// 0-based position is tested against the regions of its chromosome
+// (start <= pos < end): with keep_outside == false only records inside a
+// region are kept, with keep_outside == true only records outside.
+// Blank lines and lines with fewer than two columns are dropped.
+//
+// The region lookup takes the first region whose end is greater than the
+// position, so a position past the last region, or one that equals the end
+// of a region, is classified correctly.
+// NOTE(review): this lookup assumes the regions of one chromosome do not
+// overlap each other - confirm for the BED files in use.
+void FilterVcfFile(string vcf_filename,
+    map<string, int> & chrname_2_index,
+    vector<map<int, int>> & chr_end_start,
+    bool keep_outside){
+
+    string filter_filename = vcf_filename + ".lcr.vcf";
+
+    ifstream input(vcf_filename);
+    if(!input.good()){
+        cout << "[Error] Read vcf file " + vcf_filename + " error" << endl;
+        return;
+    }
+
+    vector<string> output_lines;
+    string line;
+    // Testing the stream itself (not .good()) keeps a final line that has no
+    // trailing newline.
+    while (std::getline(input, line)) {
+        if (line.empty()) continue;
+        if (line[0] == '#') {
+            output_lines.push_back(line);
+            continue;
+        }
+        vector<string> columns = split(line, '\t');
+        if (columns.size() < 2) continue; // CHROM and POS are required
+
+        auto chr_it = chrname_2_index.find(columns[0]);
+        if (chr_it == chrname_2_index.end()) {
+            output_lines.push_back(line);
+            continue;
+        }
+        const map<int, int> & end_start = chr_end_start[chr_it->second];
+        const int varp = stoi(columns[1]) - 1;
+
+        // First region that ends after varp; it contains varp iff it also
+        // starts at or before varp.
+        auto region = end_start.upper_bound(varp);
+        const bool inside = region != end_start.end() && varp >= region->second;
+
+        if (inside != keep_outside) {
+            output_lines.push_back(line);
+        }
+    }
+
+    ofstream filter_file(filter_filename);
+    for (const auto & kept_line : output_lines) {
+        filter_file << kept_line << endl;
+    }
+    filter_file.close();
+}
+
+// Program entry point: parse the options, load the BED regions once, then
+// filter every VCF file given on the command line.
+int main(int argc, char* argv[]){
+    Args args;
+    TclapParser(args, argc, argv);
+
+    map<string, int> chrname_2_index;
+    vector<map<int, int>> chr_end_start;
+    ReadBedfile(args.bed_filename, chrname_2_index, chr_end_start);
+
+    for (const string & vcf_filename : args.vcf_filenames) {
+        FilterVcfFile(vcf_filename, chrname_2_index, chr_end_start, args.keep_outside);
+    }
+    return 0;
+}
diff --git a/src/makefile b/src/makefile
new file mode 100644
index 0000000..3b23ec7
--- /dev/null
+++ b/src/makefile
@@ -0,0 +1,20 @@
+CXX=g++
+CXXFLAGS=-std=c++11 -pthread -g
+CXXFLAGS2=-I ../include
+CXXFLAGTBB=-ltbb
+
+# all and clean do not produce files of that name.
+.PHONY: all clean
+
+all: vm-core
+
+# Main program; a copy is placed in the parent directory.
+vm-core: vm.cpp wholegenome.cpp util.cpp
+	$(CXX) $(CXXFLAGS) $(CXXFLAGS2) -o $@ $^
+	cp $@ ../$@
+
+# Optional helper tools, not built by "all".
+filter_hc: filter_hc.cpp util.cpp
+	$(CXX) $(CXXFLAGS) $(CXXFLAGS2) -o $@ $^
+
+filter_cv: filter_cv.cpp util.cpp
+	$(CXX) $(CXXFLAGS) $(CXXFLAGS2) -o $@ $^
+
+# Remove every binary this makefile can build in this directory.
+clean:
+	rm -f vm-core filter_hc filter_cv
+	rm -f *.o
diff --git a/src/removeduplicate.cpp b/src/removeduplicate.cpp
new file mode 100644
index 0000000..fdf9c4b
--- /dev/null
+++ b/src/removeduplicate.cpp
@@ -0,0 +1,456 @@
+#include "removeduplicate.h"
+
+// Forwards the thread count to the VCF base class; nothing else is set up here.
+RemoveDuplicate::RemoveDuplicate(int thread_num_):VCF(thread_num_){}
+// Nothing is released here.
+RemoveDuplicate::~RemoveDuplicate(){}
+
+// Returns the index of the first entry of pos_boundries that is greater than
+// `pos`. A position at or beyond every boundary is assigned to the last
+// index instead of running off the end of the function without a return
+// value; the result is -1 when no boundaries exist.
+int RemoveDuplicate::GetThreadIndex(int pos){
+    const int boundary_count = (int)pos_boundries.size();
+    for(int i = 0; i < boundary_count; i++){
+        if(pos < pos_boundries[i]){
+            return i;
+        }
+    }
+    return boundary_count - 1;
+}
+
+// Reads a VCF file and keeps the first record seen for every distinct
+// upper-cased "pos_ref_alt" key.
+//
+// Kept records are stored as text in nondup_vcfentry_hash and as SNP objects
+// (type 'S', 'I' or 'D' from the ref/alt lengths) in nondup_pos_snp_map
+// under their 0-based position. Header lines, lines of at most one character
+// and lines with fewer than five columns are skipped. Records with several
+// alternative alleles are skipped as well. chromosome_name is taken from the
+// first data line when it is still ".".
+//
+// Returns the number of single-alternative records read (duplicates
+// included), or -1 when the genome has not been read yet or the file cannot
+// be opened.
+int RemoveDuplicate::ReadVCFWithoutDup(string filename){
+    if(!boundries_decided){
+        cout << "[Error: RemoveDuplicate] ReadVCFWithoutDup can not read vcf file before read genome file" << endl;
+        return -1;
+    }
+
+    ifstream vcf_file(filename.c_str());
+    if (!vcf_file.good()) {
+        cout << "[Error] RemoveDuplicate::ReadVCFWithoutDup can not open vcf file" << endl;
+        return -1;
+    }
+
+    int var_num = 0;
+    string line;
+    while (getline(vcf_file, line)) {
+        if ((int)line.length() <= 1) continue;
+        if (line[0] == '#') continue;
+        auto columns = split(line, '\t');
+        if (columns.size() < 5) continue; // CHROM, POS, ID, REF, ALT are all needed
+
+        if(chromosome_name == ".") chromosome_name = columns[0];
+        auto pos = atoi(columns[1].c_str()) - 1;
+        const string & ref = columns[3];
+        const string & alt = columns[4];
+
+        // Multi-allelic records are not handled: skip them.
+        if (alt.find(",") != string::npos) continue;
+
+        char snp_type = 'S';
+        if (ref.length() > alt.length()) {
+            snp_type = 'D';
+        } else if (ref.length() < alt.length()) {
+            snp_type = 'I';
+        }
+        var_num++;
+
+        string varid = to_string(pos) + "_" + ref + "_" + alt;
+        transform(varid.begin(), varid.end(), varid.begin(), ::toupper);
+        if (nondup_vcfentry_hash.find(varid) == nondup_vcfentry_hash.end()) {
+            nondup_vcfentry_hash[varid] = line;
+            nondup_pos_snp_map[pos].push_back(SNP(pos, snp_type, ref, alt));
+        }
+    }
+
+    vcf_file.close();
+    return var_num;
+}
+
+// Groups the de-duplicated SNPs into clusters stored in cluster_snps_map.
+//
+// All SNPs are appended to data_list and sorted, then scanned once. A new
+// cluster is started in front of a SNP when the reference stretch between
+// the end of the current cluster (c_start) and that SNP is at least 2 long,
+// longer than twice the larger of the cluster's accumulated insertion and
+// deletion lengths, and either longer than MAX_REPEAT_LEN or not accepted by
+// CheckTandemRepeat.
+void RemoveDuplicate::ClusteringSnps() {
+ //int num = 0;
+ //dout << nondup_pos_snp_map.size() << endl;
+ // Flatten the per-position map into data_list.
+ for (auto it = nondup_pos_snp_map.begin(); it != nondup_pos_snp_map.end(); ++it) {
+ auto & v = it->second;
+ for (int k = 0; k < v.size(); k++) {
+ data_list.push_back(v[k]);
+ }
+ }
+ if (data_list.size() == 0)
+ return;
+ sort(data_list.begin(), data_list.end());
+
+ int cluster_index = 0;
+ int ins_total = 0; // summed length change of insertions in the current cluster
+ int del_total = 0; // summed length change of deletions in the current cluster
+ int c_start = 0; // largest pos + ref length seen in the current cluster
+ int c_end = 0; // position of the SNP being examined
+
+ for (int i = 0; i < data_list.size(); i++) {
+ auto snp = data_list[i];
+ // check if need to separator clusters
+ if (i > 0) {
+ c_end = snp.pos;
+ if(c_end-c_start >= 2){
+ // Reference sequence between the cluster end and this SNP.
+ string separator = genome_sequence.substr(c_start, c_end - c_start);
+ int max_change = max(ins_total, del_total);
+ if ((int)separator.length() > 2 * max_change &&
+ ((int)separator.length() > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, max_change)))
+ {
+ cluster_index++;
+ ins_total = 0;
+ del_total = 0;
+ c_start = 0; // re-assign c_start
+ }
+ }
+ }
+
+ // Extend the cluster end to cover this SNP's reference allele.
+ if(c_start < snp.pos + (int)snp.ref.length()) c_start = snp.pos + (int)snp.ref.length();
+ // assign snp to cluster
+ cluster_snps_map[cluster_index].push_back(snp);
+ int ref_length = (int)snp.ref.length();
+ int alt_length = (int)snp.alt.length();
+ int diff_length = abs(ref_length - alt_length);
+
+ if (snp.snp_type == 'I') {
+ ins_total += diff_length;
+ }
+ else if (snp.snp_type == 'D') {
+ del_total += diff_length;
+ }
+ }
+}
+
+// Repeatedly splits the clusters in `snp_clusters` in place.
+//
+// The list is used as a queue: each pass pops every cluster once from the
+// front and pushes it back either unchanged or as a left and a right part.
+// potential_list runs in parallel and marks the clusters that may still be
+// split. Passes repeat until one pass performs no split.
+//
+// A cluster is split at its widest gap when the reference sequence in that
+// gap is longer than twice the largest insertion/deletion total on either
+// side and is either longer than MAX_REPEAT_LEN or not accepted by
+// CheckTandemRepeat.
+void RemoveDuplicate::DivisiveHierarchicalClustering(list<vector<SNP> > & snp_clusters){
+ //
+ if(snp_clusters.size() == 0) return;
+ bool flag = true; // true while the last pass split at least one cluster
+ list<bool> potential_list;
+ for(int i = 0; i < snp_clusters.size(); i++){
+ potential_list.push_back(true);
+ }
+ while(flag){
+ flag = false;
+ int list_size = snp_clusters.size();
+ for(int i = 0; i < list_size; i++){
+ auto front_cluster = snp_clusters.front();
+ auto front_posential = potential_list.front();
+ snp_clusters.pop_front();
+ potential_list.pop_front();
+ // Clusters already known to be unsplittable are just rotated to the back.
+ if(! front_posential){
+ snp_clusters.push_back(front_cluster);
+ potential_list.push_back(front_posential);
+ continue;
+ }
+
+ int max_start = -1;
+ int max_end = -1;
+ int max_length = -1;
+ int start = front_cluster[0].pos + (int)front_cluster[0].ref.length();
+ // find the largest gap, see if we can separate from that gap
+ // NOTE(review): `start` is never updated inside this loop, so every gap
+ // is measured from the end of the first SNP - confirm this is intended.
+ for(int k = 0; k < front_cluster.size(); k++){
+ auto snp = front_cluster[k];
+ auto snp_pos = snp.pos;
+ if(max_length < snp_pos - start){
+ max_length = snp_pos - start;
+ max_start = start;
+ max_end = snp_pos;
+ }
+ }
+
+ // No positive gap: this cluster cannot be split.
+ if(max_length <= 0){
+ snp_clusters.push_back(front_cluster);
+ potential_list.push_back(false);
+ continue;
+ }
+ int left_ins = 0;
+ int left_del = 0;
+ int right_ins = 0;
+ int right_del = 0;
+ vector<SNP> left_snp_list;
+ vector<SNP> right_snp_list;
+ string separator = genome_sequence.substr(max_start, max_end-max_start);
+ // Distribute the SNPs to the two sides of the gap and sum the length
+ // changes of insertions and deletions on each side.
+ for(int k = 0; k < front_cluster.size(); k++){
+ auto snp = front_cluster[k];
+ int snp_diff = abs((int)snp.ref.length() - (int)snp.alt.length());
+ if(snp.pos <= max_start){
+ if(snp.snp_type == 'I'){
+ left_ins += snp_diff;
+ }else if(snp.snp_type == 'D'){
+ left_del += snp_diff;
+ }
+ left_snp_list.push_back(snp);
+ }else{
+ if(snp.snp_type == 'I'){
+ right_ins += snp_diff;
+ }else if(snp.snp_type == 'D'){
+ right_del += snp_diff;
+ }
+ right_snp_list.push_back(snp);
+ }
+ }
+ //check
+ if(left_snp_list.size() == 0 || right_snp_list.size() == 0){
+ snp_clusters.push_back(front_cluster);
+ potential_list.push_back(false);
+ continue;
+ }
+
+ vector<int> change_list = {left_ins, left_del, right_ins, right_del};
+ int max_change = 0;
+ for(int k = 0; k < change_list.size(); k++){
+ if (max_change < change_list[k]) max_change = change_list[k];
+ }
+ // Same separation test as ClusteringSnps uses between clusters.
+ if ((int)separator.length() > 2 * max_change &&
+ ((int)separator.length() > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, max_change)))
+ {
+ flag = true;
+ snp_clusters.push_back(left_snp_list);
+ potential_list.push_back(true);
+ snp_clusters.push_back(right_snp_list);
+ potential_list.push_back(true);
+ }else{
+ snp_clusters.push_back(front_cluster);
+ potential_list.push_back(false);
+ continue;
+ }
+ }
+
+ }
+ return;
+}
+
+// Looks for two different SNP sets from `snp_list` that turn `subsequence`
+// into the same donor sequence.
+//
+// Combinations of size 1 .. snp_list.size() - 1 are generated with
+// CreateCombinations; combinations rejected by CheckVariantOverlap are
+// skipped. Each remaining combination is applied with
+// ModifySequenceBySnpList and remembered under the resulting sequence. The
+// first time a sequence is produced again, one tab-separated record
+// (chromosome, 1-based position, trimmed ref, trimmed alt, set 1, set 2) is
+// appended to complex_match_records[thread_index], the SNPs of both sets are
+// erased from `snp_list`, and true is returned.
+//
+// `offset` is the genome position of subsequence[0]. Returns false when
+// snp_list has at most one element or no such pair exists.
+bool RemoveDuplicate::FindOneMatch(vector<SNP> & snp_list,
+ const string subsequence,
+ int offset,
+ int thread_index)
+{
+ if(snp_list.size() <= 1) return false;
+ // donor sequence -> the combination that produced it
+ unordered_map<string, vector<SNP>> donor_snps;
+ for(int i = 1; i < snp_list.size(); i++){
+ vector<vector<SNP> > combinations = CreateCombinations(snp_list, i);
+ for(int k = 0; k < combinations.size(); k++){
+ vector<SNP> comb = combinations[k];
+ if(CheckVariantOverlap(comb)) continue;
+ string alt_sequence = ModifySequenceBySnpList(subsequence, comb, offset);
+ //dout << alt_sequence << endl;
+ if(donor_snps.find(alt_sequence) != donor_snps.end()){
+ string matching_result = "";
+ matching_result += chromosome_name;
+ string parsimonious_ref = subsequence;
+ string parsimonious_alt = alt_sequence;
+ if(parsimonious_ref == parsimonious_alt){
+ dout << "[Error:RemoveDuplicate::FindOneMatch] in variant, ref == alt";
+ }
+ // Trim the bases shared by ref and alt (case-insensitively).
+ int min_parsimonious_len = min(parsimonious_ref.size(), parsimonious_alt.size());
+ int chop_left = 0;
+ int chop_right = 0;
+ // Note: this `i` shadows the combination-size counter of the outer loop.
+ for(int i = 0; i < min_parsimonious_len; i++){
+ if(toupper(parsimonious_ref[i]) == toupper(parsimonious_alt[i])){
+ chop_left ++;
+ }else{
+ break;
+ }
+ }
+ // NOTE(review): both strings are indexed with the same i counted down
+ // from min_parsimonious_len-1, not from each string's own end, so for
+ // strings of different length this does not compare their suffixes -
+ // confirm the intended trimming.
+ for(int i = min_parsimonious_len-1; i >= 0; i--){
+ if(toupper(parsimonious_ref[i]) == toupper(parsimonious_alt[i])){
+ chop_right ++;
+ }else{
+ break;
+ }
+ }
+ // 1-based
+ // Keep one base on the left when trimming would empty ref or alt.
+ // NOTE(review): chop_left is decremented without checking it is > 0.
+ if ((int)parsimonious_ref.length() - chop_left - chop_right == 0 || (int)parsimonious_alt.length() - chop_left - chop_right == 0)
+ chop_left --;
+ matching_result += "\t" + to_string(chop_left + offset + 1);
+
+ parsimonious_ref = parsimonious_ref.substr(chop_left, (int)parsimonious_ref.length() - chop_left - chop_right);
+ parsimonious_alt = parsimonious_alt.substr(chop_left, (int)parsimonious_alt.length() - chop_left - chop_right);
+ matching_result += "\t" + parsimonious_ref + "\t" + parsimonious_alt;
+
+ // Set 1: the current combination. Each member is removed from snp_list
+ // and listed as "pos,ref,alt;" with a 1-based position.
+ string set_matching_string = "";
+ for(int m = 0; m < comb.size(); m++){
+ auto m_snp = comb[m];
+ for(auto it = snp_list.begin(); it != snp_list.end(); ++it){
+ auto del_snp = *it;
+ if(m_snp.snp_type == del_snp.snp_type && m_snp.pos == del_snp.pos && m_snp.ref == del_snp.ref && m_snp.alt == del_snp.alt){
+ snp_list.erase(it);
+ break;
+ }
+ }
+ set_matching_string += to_string(m_snp.pos+1) + "," + m_snp.ref + "," + m_snp.alt + ";";
+ }
+ matching_result += "\t"+set_matching_string;
+
+ // Set 2: the earlier combination that produced the same sequence.
+ set_matching_string = "";
+ for(int m = 0; m < donor_snps[alt_sequence].size(); m++){
+ auto m_snp = donor_snps[alt_sequence][m];
+ for(auto it = snp_list.begin(); it != snp_list.end(); ++it){
+ auto del_snp = *it;
+ if(m_snp.snp_type == del_snp.snp_type && m_snp.pos == del_snp.pos && m_snp.ref == del_snp.ref && m_snp.alt == del_snp.alt){
+ snp_list.erase(it);
+ break;
+ }
+ }
+ set_matching_string += to_string(m_snp.pos+1) + "," + m_snp.ref + "," + m_snp.alt + ";";
+ }
+ matching_result += "\t"+set_matching_string + "\n";
+
+ complex_match_records[thread_index]->push_back(matching_result);
+
+ return true;
+ }else{
+ donor_snps[alt_sequence] = comb;
+ }
+ }
+ }
+ return false;
+}
+
+// Extracts every equivalent SNP-set pair from one cluster.
+// The reference window spans from one base before the first SNP to one base
+// after the furthest reference allele end, clamped to the genome.
+// FindOneMatch is called on that window until it fails or fewer than two
+// SNPs remain. `snp_list` is a private copy, so the caller's cluster is not
+// modified.
+void RemoveDuplicate::FindMatches(vector<SNP> snp_list, int thread_index){
+    if (snp_list.size() < 2) return;
+
+    sort(snp_list.begin(), snp_list.end());
+
+    int window_end = 0;
+    for (const auto & snp : snp_list) {
+        const int snp_end = snp.pos + (int)snp.ref.length();
+        if (snp_end > window_end) window_end = snp_end;
+    }
+    int window_start = snp_list[0].pos;
+    window_start = max(0, window_start - 1);
+    window_end = min(window_end + 1, (int)genome_sequence.length());
+
+    string subsequence = genome_sequence.substr(window_start, window_end - window_start);
+    for (;;) {
+        if (snp_list.size() <= 1) break;
+        if (!FindOneMatch(snp_list, subsequence, window_start, thread_index)) break;
+    }
+}
+
+// Worker routine: processes the cluster ids in [start, end).
+// Ids without an entry and clusters with fewer than two SNPs are skipped.
+// Clusters with more than 20 SNPs are first divided with
+// DivisiveHierarchicalClustering and each part is searched separately;
+// smaller clusters are searched directly.
+void RemoveDuplicate::ClusteringRemoveDuplicateInThread(int start, int end, int thread_index){
+    for (int cluster_id = start; cluster_id < end; ++cluster_id) {
+        auto found = cluster_snps_map.find(cluster_id);
+        if (found == cluster_snps_map.end()) continue;
+
+        auto & members = found->second;
+        if (members.size() <= 1) continue;
+
+        if (members.size() <= 20) {
+            FindMatches(members, thread_index);
+            continue;
+        }
+
+        list<vector<SNP> > sub_clusters = {members};
+        DivisiveHierarchicalClustering(sub_clusters);
+        for (auto & sub_cluster : sub_clusters) {
+            FindMatches(sub_cluster, thread_index);
+        }
+    }
+}
+
+// Searches all clusters for complex duplications with thread_num workers and
+// writes the collected records to output_complex_filename.
+//
+// The cluster ids, starting at the first key of cluster_snps_map, are cut
+// into thread_num consecutive ranges of ceil(cluster count / thread_num)
+// ids. The first thread_num - 1 ranges run in new threads and the last one
+// in the calling thread. Every worker appends to its own vector in
+// complex_match_records, so the workers do not share an output container.
+//
+// An empty cluster map is handled: the ranges are then empty and only the
+// header line is written. Nothing is done when thread_num is smaller than 1.
+// NOTE(review): complex_match_records is allocated here and not released in
+// this function - confirm that it is freed elsewhere.
+void RemoveDuplicate::ClusteringRemoveDuplicateMultiThread(){
+    if (thread_num < 1) {
+        dout << "[Error] thread number not match" << endl;
+        return;
+    }
+
+    // begin() must not be dereferenced when there are no clusters.
+    int start = cluster_snps_map.empty() ? 0 : cluster_snps_map.begin()->first;
+    int cluster_number = cluster_snps_map.size();
+    int cluster_step = cluster_number / thread_num;
+    if (cluster_step * thread_num < cluster_number) cluster_step++;
+    int end = start + cluster_step;
+
+    //initialize vector size, each allocating will have a lock
+    complex_match_records = new vector<string>* [thread_num];
+    for(int j = 0; j < thread_num; j++){
+        complex_match_records[j] = new vector<string>;
+    }
+
+    vector<thread> threads;
+    //spawn threads
+    int i = 0;
+    for (; i < thread_num - 1; i++) {
+        threads.push_back(thread(&RemoveDuplicate::ClusteringRemoveDuplicateInThread, this, start, end, i));
+        start = end;
+        end = start + cluster_step;
+    }
+    // also you need to do a job in main thread
+    // i equals to (thread_num - 1)
+    if (start >= cluster_snps_map.size()) {
+        dout << "[Error] index out of map range" << endl;
+    }
+    else {
+        ClusteringRemoveDuplicateInThread(start, end, i);
+    }
+
+    // Wait for every worker before reading the per-thread records.
+    std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
+
+    ofstream output_complex_file;
+    output_complex_file.open(output_complex_filename);
+    output_complex_file << "#CHR\tPOS\tREF\tALT\tSet1\tSet2" << endl;
+    for(int t = 0; t < thread_num; t++){
+        for (size_t j = 0; j < complex_match_records[t]->size(); j++){
+            // Skip records that consist of spaces only.
+            if(complex_match_records[t]->at(j).find_first_not_of(' ') != std::string::npos){
+                output_complex_file << complex_match_records[t]->at(j);
+            }
+        }
+    }
+    output_complex_file.close();
+}
+
+// Public entry point of the duplicate-removal pipeline.
+//
+// Derives the three output file names from `output_prefix` (".stat",
+// ".simple", ".complex"), then runs, in this order: ReadGenomeSequence,
+// ReadVCFWithoutDup, ClusteringSnps and
+// ClusteringRemoveDuplicateMultiThread, printing a progress message around
+// each step. The return value of ReadVCFWithoutDup is not checked, and
+// `direct_search` is not used in this function.
+void RemoveDuplicate::Deduplicate(string vcf_filename,
+ string genome_filename,
+ bool direct_search,
+ string output_prefix)
+{
+ //dout << output_prefix << endl;
+ output_stat_filename = output_prefix + ".stat";
+ output_simple_filename = output_prefix + ".simple";
+ output_complex_filename = output_prefix + ".complex";
+
+ //------------read genome sequence and decide boundary according to thread number
+ dsptime();
+ dout << " Read genome sequence file... " << endl;
+ ReadGenomeSequence(genome_filename);
+ dsptime();
+ dout << " Finish reading genome sequence file." << endl;
+
+ // The genome must be read first: ReadVCFWithoutDup refuses to run before
+ // boundries_decided is set.
+ dsptime();
+ dout << " Read vcf file and remove simple duplications... " << endl;
+ ReadVCFWithoutDup(vcf_filename);
+ dsptime();
+
+ //-------------clustering search
+ dsptime();
+ dout << " Clustering snps ... " << endl;
+ ClusteringSnps();
+ dsptime();
+ dout << " Finish clustering." << endl;
+ dsptime();
+ dout << " Detect complex duplications..." << endl;
+ // This call also writes the ".complex" output file.
+ ClusteringRemoveDuplicateMultiThread();
+ dsptime();
+ dout << " Output complex duplications..." << endl;
+
+
+ return;
+}
diff --git a/src/removeduplicate.h b/src/removeduplicate.h
new file mode 100644
index 0000000..837bc16
--- /dev/null
+++ b/src/removeduplicate.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include "vcf.h"
+
+typedef unordered_map<string, string> VCFEntryHash;
+
+// Detects duplicated variants inside a single VCF file.
+// Identical pos/ref/alt records are collapsed while the file is read; sets of
+// variants that produce the same donor sequence are reported afterwards.
+class RemoveDuplicate: public VCF
+{
+private:
+ // 0-based position -> de-duplicated SNPs at that position
+ map<int, vector<SNP> > nondup_pos_snp_map;
+ VCFEntryHash nondup_vcfentry_hash; // id is pos_ref_alt with uppercase
+
+ // Index of the first position boundary greater than pos.
+ int GetThreadIndex(int pos);
+ // Reads the VCF file, keeping the first record per pos_ref_alt id.
+ int ReadVCFWithoutDup(string filename);
+ // Worker: searches the cluster ids in [start, end).
+ void ClusteringRemoveDuplicateInThread(int start, int end, int thread_index);
+ // Distributes the clusters over the threads and writes the ".complex" file.
+ void ClusteringRemoveDuplicateMultiThread();
+ // Groups the de-duplicated SNPs into clusters.
+ void ClusteringSnps() override;
+ // Splits clusters in place at their widest gap where possible.
+ void DivisiveHierarchicalClustering(list<vector<SNP>>& snp_clusters);
+ // Reports every equivalent SNP-set pair of one cluster.
+ void FindMatches(vector<SNP> snp_list, int thread_index);
+ // Reports one equivalent SNP-set pair and removes it from snp_list.
+ bool FindOneMatch(vector<SNP> & snp_list, const string subsequence, int offset, int thread_index);
+
+public:
+ RemoveDuplicate(int thread_num_);
+ ~RemoveDuplicate();
+
+ // Runs the whole pipeline; output files are named from output_prefix.
+ void Deduplicate(string vcf_filename,
+ string genome_filename,
+ bool direct_search,
+ string output_prefix);
+
+};
diff --git a/src/splitvcf.cpp b/src/splitvcf.cpp
new file mode 100644
index 0000000..2eed461
--- /dev/null
+++ b/src/splitvcf.cpp
@@ -0,0 +1,30 @@
+//#include "stdafx.h"
+#include "splitvcf.h"
+
+// Parses the command line with TCLAP: one unlabeled input VCF file and a
+// required -g genome list file. On a TCLAP::ArgException the error is
+// printed to stderr and the process is terminated with abort().
+// NOTE(review): `argv` is declared as `char*`, while main() passes its
+// `char* argv[]`; the parameter probably has to be `char**` here and in
+// splitvcf.h.
+// NOTE(review): `version` is assigned here, but the class declaration in
+// splitvcf.h shows no such member.
+// NOTE(review): the parsed values are never copied into vcf_filename or
+// genome_list_filename.
+SplitVcf::SplitVcf(int argc, char* argv) {
+ version = "0.9";
+
+ try {
+ std::string desc = "split vcf file according to chromosome. \n";
+ TCLAP::CmdLine cmd(desc, ' ', version);
+ //TCLAP::ValueArg<std::string> arg_input_vcf_file("i", "i", "input VCF file", true, "", "file", cmd);
+ // Both arguments register themselves because `cmd` is passed to their constructors.
+ TCLAP::UnlabeledValueArg<std::string> arg_input_vcf_file("<in.vcf>", "input VCF file", true, "", "file", cmd);
+ TCLAP::ValueArg<std::string> arg_genome_list_file("g", "g", "genome list file", true, "", "file", cmd);
+ cmd.parse(argc, argv);
+ }
+ catch (TCLAP::ArgException &e)
+ {
+ std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
+ abort();
+ }
+}
+
+// Nothing to release.
+SplitVcf::~SplitVcf() {
+
+}
+
+// Program entry point: parse the command line, then split the VCF file.
+// The splitter is an ordinary local object, so nothing is leaked, and the
+// exit status is 0 only when Split() reports success.
+// NOTE(review): SplitVcf's constructor is currently declared with
+// `char* argv`; it has to accept `char**` for this call to compile.
+int main(int argc, char* argv[]) {
+    SplitVcf sf(argc, argv);
+    return sf.Split() ? 0 : 1;
+}
+
diff --git a/src/splitvcf.h b/src/splitvcf.h
new file mode 100644
index 0000000..47af518
--- /dev/null
+++ b/src/splitvcf.h
@@ -0,0 +1,15 @@
+#include <iostream>
+#include "util.h"
+#include <tclap/CmdLine.h>
+
+// Command-line tool object that splits a VCF file according to chromosome.
+class SplitVcf
+{
+private:
+ std::string genome_list_filename; // presumably the value of -g - TODO confirm, it is never assigned in splitvcf.cpp
+ std::string vcf_filename; // presumably the input VCF path - TODO confirm, it is never assigned in splitvcf.cpp
+
+public:
+ // Parses the command line.
+ // NOTE(review): argv is declared as `char*`, not `char**`.
+ SplitVcf(int argc, char* argv);
+ ~SplitVcf();
+ // No definition of Split() appears in splitvcf.cpp.
+ bool Split();
+};
diff --git a/src/test.py b/src/test.py
new file mode 100644
index 0000000..a4c8bcd
--- /dev/null
+++ b/src/test.py
@@ -0,0 +1 @@
+print('h')
\ No newline at end of file
diff --git a/src/threadguard.cpp b/src/threadguard.cpp
new file mode 100644
index 0000000..e5a512d
--- /dev/null
+++ b/src/threadguard.cpp
@@ -0,0 +1,9 @@
+#include "threadguard.h"
+
+
+// Joins the guarded thread if it is still joinable.
+ThreadGuard::~ThreadGuard()
+{
+    if (!t.joinable()) {
+        return;
+    }
+    t.join();
+}
diff --git a/src/threadguard.h b/src/threadguard.h
new file mode 100644
index 0000000..1235e9d
--- /dev/null
+++ b/src/threadguard.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <thread>
+using namespace std;
+
+// Holds a reference to a thread owned by the caller. The destructor is
+// defined out of line (threadguard.cpp) and joins the thread if joinable.
+class ThreadGuard
+{
+public:
+    explicit ThreadGuard(thread& t_) : t(t_) {}
+    ~ThreadGuard();
+
+    // Copying is disabled.
+    ThreadGuard(ThreadGuard const &) = delete;
+    ThreadGuard& operator=(ThreadGuard const&) = delete;
+
+private:
+    thread & t;
+};
+
diff --git a/src/util.cpp b/src/util.cpp
new file mode 100644
index 0000000..42021d9
--- /dev/null
+++ b/src/util.cpp
@@ -0,0 +1,20 @@
+#include "util.h"
+
+/* Split `s` at every `delim`, append each non-empty token to `elems`,
+   and return `elems`. */
+std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) {
+    std::stringstream stream(s);
+    for (std::string token; std::getline(stream, token, delim); ) {
+        if (token.empty()) {
+            continue;   // consecutive delimiters produce no token
+        }
+        elems.push_back(token);
+    }
+    return elems;
+}
+
+/* Convenience overload returning the tokens by value. Only a single char is
+   supported as delimiter; for a string delimiter use boost's split function. */
+std::vector<std::string> split(const std::string &s, char delim) {
+    std::vector<std::string> tokens;
+    return split(s, delim, tokens);
+}
\ No newline at end of file
diff --git a/src/util.h b/src/util.h
new file mode 100644
index 0000000..4dcb105
--- /dev/null
+++ b/src/util.h
@@ -0,0 +1,54 @@
+//#ifndef UTILITIES_H
+//#define UTILITIES_H
+#pragma once
+
+#define DEBUG
+
+#ifdef DEBUG
+#define dout cout
+#else
+#define dout 0 && cout
+#endif
+
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <vector>
+#include <string>
+#include <sstream>
+#include <cstring>
+#include <sys/stat.h>
+#include <cassert>
+#include <cstdlib>
+
+using namespace std;
+
+/*split function*/
+std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems);
+
+/*This split function only supports a char as delimiter; for a string delimiter, please use boost's split function*/
+std::vector<std::string> split(const std::string &s, char delim);
+
+/* Returns true when stat() succeeds for `name`. */
+inline bool FileExists (const std::string& name) {
+    struct stat info;
+    const int status = stat(name.c_str(), &info);
+    return status == 0;
+}
+
+/* Format `number` with operator<< and return the resulting text. */
+template <typename T>
+inline
+std::string ToString(const T & number){
+    // A named stream avoids taking the address of a temporary stream
+    // expression, which newer standard libraries reject.
+    std::ostringstream converter;
+    converter << number;
+    return converter.str();
+}
+
+/* Print the current local time to cout as "M/D/YYYY,H:M:S " (no padding). */
+inline void dsptime()
+{
+    time_t now = time(NULL);
+    const struct tm * local = localtime(&now);
+    const int month = local->tm_mon + 1;      // tm_mon is zero-based
+    const int year = local->tm_year + 1900;   // tm_year counts from 1900
+    cout << month << "/" << local->tm_mday << "/" << year << ","
+         << local->tm_hour << ":" << local->tm_min << ":" << local->tm_sec << " ";
+}
+
+//#endif
diff --git a/src/vcf.cpp b/src/vcf.cpp
new file mode 100644
index 0000000..a510bcd
--- /dev/null
+++ b/src/vcf.cpp
@@ -0,0 +1,1230 @@
+#include "vcf.h"
+
+
+// Orders SNPs by position only.
+bool operator <(const SNP& x, const SNP& y) {
+    return y.pos > x.pos;
+}
+
+// Two SNPs are equal when position, type, alt allele and genotype all agree.
+bool operator ==(const SNP& x, const SNP& y) {
+    return x.pos == y.pos
+        && x.snp_type == y.snp_type
+        && x.alt == y.alt
+        && x.genotype == y.genotype;
+}
+
+// Initialise matching state and choose the number of worker threads.
+// thread_num_: requested thread count; values <= 0 select a single thread,
+// larger values are capped by the hardware concurrency.
+VCF::VCF(int thread_num_)
+{
+    debug_f = 0;
+    genome_sequence = "";
+    boundries_decided = false;
+    clustering_search = false;
+    match_genotype = true;
+
+    // hardware_concurrency() returns 0 when the value cannot be determined;
+    // never let that produce thread_num == 0 (it is used as a divisor later).
+    int hardware_threads = (int)thread::hardware_concurrency();
+    if (hardware_threads <= 0) {
+        hardware_threads = 1;
+    }
+    if (thread_num_ <= 0) {
+        thread_num = 1;
+    }
+    else {
+        thread_num = min(thread_num_, hardware_threads);
+    }
+    dout << "VCF() Thread Number: " << thread_num << endl;
+    chromosome_name = ".";
+}
+
+// Empty destructor body.
+VCF::~VCF() {}
+
+
+// protected
+// Reduce a (ref, alt) allele pair to a parsimonious representation.
+//   pos  0-based index of the first ref base in genome_sequence
+//   ref  reference allele, alt  alternative allele
+//   parsimonious_ref / parsimonious_alt  receive the trimmed alleles
+// Shared trailing bases are removed (extending to the left with genome
+// bases whenever an allele becomes empty), then shared leading bases are
+// removed while both alleles keep at least one base.
+// Returns false when the genome is not loaded, pos is outside the genome,
+// the ref allele does not match the genome, or no further left extension is
+// possible; the output strings then hold an intermediate state.
+bool VCF::NormalizeSnp(int pos, string ref, string alt, string & parsimonious_ref, string & parsimonious_alt) {
+    parsimonious_ref = ref;
+    parsimonious_alt = alt;
+    int left_index = pos;
+    if (genome_sequence.size() == 0) return false;
+    if (pos < 0 || pos >= (int)genome_sequence.size()) {
+        dout << "[Error] variant position outside genome sequence." << endl;
+        return false;
+    }
+    //if (parsimonious_ref.size() == 1 || parsimonious_alt.size() == 1) return true;
+    if (toupper(genome_sequence[left_index]) != toupper(parsimonious_ref[0])) {
+        dout << "[Error] genome sequence, subsequence, offset does not match." << endl;
+        return false;
+    }
+    bool change_in_allels = true;
+    while (change_in_allels) {
+        change_in_allels = false;
+        // back() is only defined on non-empty strings (alt may start empty).
+        if (!parsimonious_ref.empty() && !parsimonious_alt.empty() &&
+            toupper(parsimonious_ref.back()) == toupper(parsimonious_alt.back())) {
+            if ((parsimonious_ref.size() > 1 && parsimonious_alt.size() > 1) || left_index > 0) {
+                parsimonious_ref.pop_back();
+                parsimonious_alt.pop_back();
+                change_in_allels = true;
+            }
+            else {
+                return false;
+            }
+        }
+        if (parsimonious_ref.length() == 0 || parsimonious_alt.length() == 0) {
+            // No base to the left of the genome start to extend with.
+            if (left_index <= 0) return false;
+            left_index--;
+            char left_char = genome_sequence[left_index];
+            parsimonious_ref = left_char + parsimonious_ref;
+            parsimonious_alt = left_char + parsimonious_alt;
+            // Re-check the (new) trailing bases after the extension.
+            change_in_allels = true;
+        }
+    }
+    while (toupper(parsimonious_ref[0]) == toupper(parsimonious_alt[0]) && parsimonious_ref.size() > 1 && parsimonious_alt.size() > 1) {
+        parsimonious_ref.erase(0, 1);
+        parsimonious_alt.erase(0, 1);
+    }
+    return true;
+}
+
+// private
+// Read a VCF file and store its variants in pos_2_snp.
+//   filename   path of the VCF file
+//   pos_2_snp  per-thread maps (position -> SNP list) that receive the variants
+// The genome must have been read first so that the per-thread position
+// boundaries exist. With match_genotype set, only the alleles named by the GT
+// field of the first sample are stored; otherwise every ALT allele is stored.
+// Records that cannot be parsed are skipped with a message.
+void VCF::ReadVCF(string filename, SnpHash & pos_2_snp) {
+    if (!boundries_decided) {
+        dout << "[Error] VCF::ReadVCF cannot read vcf file before read genome file" << endl;
+        return;
+    }
+
+    ifstream vcf_file;
+    vcf_file.open(filename.c_str());
+    if (!vcf_file.good()) {
+        cout << "[VarMatch] Error: can not open vcf file" << endl;
+        return;
+    }
+
+    if(normalization){
+        dout << "normalize while read" << endl;
+    }
+    while (!vcf_file.eof()) { // alternative way is vcf_file != NULL
+        string line;
+        getline(vcf_file, line, '\n');
+        // check ineligible lines
+        if ((int)line.length() <= 1) continue;
+        if (line.find_first_not_of(' ') == std::string::npos) continue;
+
+        if (line[0] == '#'){
+            if(line[1] == '#') continue;
+            // column header line: fewer than 10 columns means no sample column
+            auto head_names = split(line, '\t');
+            if(match_genotype && head_names.size() < 10){
+                cout << "[VarMatch] Warning: not enough information in VCF file for genotype matching." << endl;
+                cout << "[VarMatch] \tVCF file name " << filename << endl;
+                cout << "[VarMatch] \tAutomatically turn off genotype matching module." << endl;
+                match_genotype = false;
+            }
+            continue;
+        }
+        auto columns = split(line, '\t');
+        if(match_genotype && columns.size() < 10){
+            cout << "[VarMatch] Warning: not enough information in VCF file for genotype matching." << endl;
+            cout << "[VarMatch] \tskip current variant " << filename << endl;
+            continue;
+        }
+        // CHROM, POS, ID, REF and ALT are indexed unconditionally below.
+        if (columns.size() < 5) {
+            cout << "[VarMatch] Warning: malformed VCF record, fewer than 5 columns." << endl;
+            cout << "[VarMatch] \tskip current variant " << filename << endl;
+            continue;
+        }
+        if(chromosome_name == ".") chromosome_name = columns[0];
+        auto pos = atoi(columns[1].c_str()) - 1; // VCF is 1-based, internal positions are 0-based
+        auto ref = columns[3];
+        auto alt_line = columns[4];
+
+        if (ref == ".") ref = "";
+        if (alt_line == ".") alt_line = "";
+        //decide which thread to use
+        int thread_index = 0;
+        for (int i = 0; i < (int)pos_boundries.size(); i++) {
+            if (pos < pos_boundries[i]) {
+                thread_index = i;
+                break;
+            }
+        }
+
+        int genotype_index = -1;
+        string genotype = "1/1";
+        vector<string> genotype_columns;
+
+        if (match_genotype){
+            auto formats = split(columns[8], ':');
+            for(int i = 0; i < (int)formats.size(); i++){
+                if(formats[i] == "GT"){
+                    genotype_index = i;
+                    break;
+                }
+            }
+            if (genotype_index < 0) {
+                cout << "[VarMatch] Warning: not enough information in VCF file for genotype matching." << endl;
+                cout << "[VarMatch] \tskip current variant " << filename << endl;
+                continue;
+            }
+            auto additionals = split(columns[9], ':');
+            // the sample column may carry fewer fields than FORMAT announces
+            if (genotype_index >= (int)additionals.size()) {
+                cout << "[VarMatch] Warning: not enough information in VCF file for genotype matching." << endl;
+                cout << "[VarMatch] \tskip current variant " << filename << endl;
+                continue;
+            }
+            genotype = additionals[genotype_index];
+
+            if(genotype.find("/") != std::string::npos){
+                genotype_columns = split(genotype, '/');
+            }else if(genotype.find("|") != std::string::npos){
+                genotype_columns = split(genotype, '|');
+            }else{
+                cout << "[VarMatch] Error: Unrecognized Genotype: " << genotype << endl;
+                continue;
+            }
+            // normalize format of genotype: sorted, separated by |
+            if(genotype_columns.size() < 2){
+                // e.g. "1/": both allele entries are indexed below, so skip
+                cout << "[VarMatch] Warning Unrecognized Genotype: " << genotype << endl;
+                continue;
+            }
+            if(genotype_columns.size() != 2){
+                cout << "[VarMatch] Warning Unrecognized Genotype: " << genotype << endl;
+            }else{
+                sort(genotype_columns.begin(), genotype_columns.end());
+                genotype = genotype_columns[0]+"|"+genotype_columns[1];
+            }
+        }
+
+        vector<string> alt_list;
+        if (alt_line.find(",") != std::string::npos){
+            alt_list = split(alt_line, ',');
+        }else{
+            alt_list.push_back(alt_line);
+        }
+
+        // Classify one alt allele (S/D/I by length) and store it, normalized
+        // when normalization is enabled.
+        auto add_variant = [&](const string & alt) {
+            char snp_type = 'S';
+            if ((int)ref.length() > (int)alt.length()) {
+                snp_type = 'D';
+            }
+            else if ((int)ref.length() < (int)alt.length()) {
+                snp_type = 'I';
+            }
+            if (normalization) {
+                string norm_ref, norm_alt;
+                NormalizeSnp(pos, ref, alt, norm_ref, norm_alt);
+                pos_2_snp[thread_index][pos].push_back(SNP(pos, snp_type, norm_ref, norm_alt, genotype));
+            }
+            else {
+                pos_2_snp[thread_index][pos].push_back(SNP(pos, snp_type, ref, alt, genotype));
+            }
+        };
+
+        if(!match_genotype){
+            for(auto alt_it = alt_list.begin(); alt_it != alt_list.end(); ++alt_it){
+                add_variant(*alt_it);
+            }
+        }else{
+            //append variants according to genotype
+            if(genotype == "0|0") continue;
+
+            if (alt_list.size() > 1) {
+                haplotype_matching_check[thread_index][pos] = 0;
+            }
+
+            int genotype_val = atoi(genotype_columns[0].c_str()) - 1;
+
+            if(genotype_val >= 0){ // if genotype == -1, it is reference which does not need to be added
+                if (genotype_val >= (int)alt_list.size()) {
+                    // allele index has no matching ALT entry: report, do not index
+                    cout << "[VarMatch] Warning: Unrecognized Genotype. " << genotype_val << endl;
+                }
+                else {
+                    add_variant(alt_list[genotype_val]);
+                }
+            }
+            if(genotype_columns[0] != genotype_columns[1]){
+                // add another alt, one genotype corresponding to one alt
+                genotype_val = atoi(genotype_columns[1].c_str()) - 1;
+
+                if(genotype_val >= 0){
+                    if (genotype_val >= (int)alt_list.size()) {
+                        cout << "[VarMatch] Warning: Unrecognized Genotype[2]. " << genotype_val << endl;
+                    }
+                    else {
+                        add_variant(alt_list[genotype_val]);
+                    }
+                }
+            }
+        }
+    }
+    vcf_file.close();
+    return;
+}
+
+// protected
+// Load the genome from a FASTA file into genome_sequence, then compute the
+// per-thread boundaries. Header lines (starting with '>') are skipped and all
+// sequence lines are concatenated. If the file cannot be opened a message is
+// printed and nothing else happens.
+void VCF::ReadGenomeSequence(string filename) {
+    ifstream genome_file;
+    genome_file.open(filename.c_str());
+    if (!genome_file.good()) {
+        cout << "[VarMatch] can not open FASTA file: ";
+        cout << filename << endl;
+        return;
+    }
+
+    genome_sequence = "";
+
+    string line;
+    while (getline(genome_file, line, '\n')) {
+        // Drop the carriage return of CRLF files so it never becomes part of
+        // the sequence and shifts positions.
+        if (!line.empty() && line[line.length() - 1] == '\r') {
+            line.erase(line.length() - 1);
+        }
+        // Skip empty lines only: a sequence line may hold a single base.
+        if (line.empty()) continue;
+        if (line[0] == '>') continue;
+        genome_sequence += line;
+    }
+    genome_file.close();
+    // boundaries can only be computed once the genome sequence is known.
+    DecideBoundaries();
+    return;
+}
+
+// protected
+// Split the genome into thread_num position ranges and append one empty
+// container per range to each of the per-thread SNP tables.
+void VCF::DecideBoundaries() {
+    const int genome_size = genome_sequence.size();
+    const int distance = genome_size / thread_num;
+
+    // interior boundaries first, the genome end closes the last range
+    for (int k = 1; k < thread_num; k++) {
+        pos_boundries.push_back(k * distance);
+    }
+    pos_boundries.push_back(genome_size);
+
+    for (int k = 0; k < thread_num; k++) {
+        refpos_2_snp.push_back(unordered_map<int, vector<SNP> >());
+        querypos_2_snp.push_back(unordered_map<int, vector<SNP> >());
+        refpos_snp_map.push_back(map<int, vector<SNP> >());
+        querypos_snp_map.push_back(map<int, vector<SNP> >());
+    }
+
+    boundries_decided = true;
+}
+
+// private
+// Read `filename` into the reference SNP table.
+void VCF::ReadRefVCF(string filename) {
+    ReadVCF(filename, refpos_2_snp);
+}
+
+// private
+// Read `filename` into the query SNP table.
+void VCF::ReadQueryVCF(string filename) {
+    ReadVCF(filename, querypos_2_snp);
+}
+
+// protected
+// Returns true when r and q describe the same variant: same position, same
+// genotype (only checked while match_genotype is on) and the same ref/alt
+// alleles compared case-insensitively.
+bool VCF::CompareSnps(SNP r, SNP q) {
+    if (r.pos != q.pos) return false;
+
+    // directly match genotype
+    if (match_genotype && r.genotype != q.genotype) return false;
+
+    auto upper = [](string allele) -> string {
+        transform(allele.begin(), allele.end(), allele.begin(), ::toupper);
+        return allele;
+    };
+    return upper(r.ref) == upper(q.ref) && upper(r.alt) == upper(q.alt);
+}
+
+//private
+// Match reference and query variants that are identical at the same position.
+//   ref_snps / query_snps  position -> SNP list tables of one genome range
+//   thread_index           slot of direct_match_records receiving the records
+// Every matched pair is recorded as "chrom<TAB>pos(1-based)<TAB>ref<TAB>alt"
+// and removed from both tables; positions whose list becomes empty are erased.
+void VCF::DirectSearchInThread(unordered_map<int, vector<SNP> > & ref_snps, unordered_map<int, vector<SNP> > & query_snps, int thread_index) {
+    // handle heterozygous variants
+    for (auto rit = ref_snps.begin(); rit != ref_snps.end(); ) {
+        auto & r_snps = rit->second;
+        auto qit = query_snps.find(rit->first);
+        if (qit == query_snps.end()) {
+            ++rit;
+            continue;
+        }
+        auto & q_snps = qit->second;
+
+        for (auto r_snp_it = r_snps.begin(); r_snp_it != r_snps.end(); ) {
+            bool matched_r_snp = false;
+            for (auto q_snp_it = q_snps.begin(); q_snp_it != q_snps.end(); ++q_snp_it) {
+                if (CompareSnps(*r_snp_it, *q_snp_it)) {
+                    auto temp_snp = *r_snp_it;
+                    string matching_result = chromosome_name + '\t' + to_string(temp_snp.pos+1) + "\t" + temp_snp.ref + "\t" + temp_snp.alt;
+                    direct_match_records[thread_index]->push_back(matching_result);
+                    matched_r_snp = true;
+                    // the iterator is not used again after this erase
+                    q_snps.erase(q_snp_it);
+                    break;
+                }
+            }
+            if (matched_r_snp) {
+                // erase() invalidates r_snp_it; continue from its return value
+                r_snp_it = r_snps.erase(r_snp_it);
+            }
+            else {
+                ++r_snp_it;
+            }
+        }
+        if (r_snps.size() == 0) {
+            rit = ref_snps.erase(rit);
+        }
+        else {
+            ++rit;
+        }
+        if (q_snps.size() == 0) {
+            query_snps.erase(qit);
+        }
+    }
+}
+
+// directly match by position
+// private
+// Run DirectSearchInThread on every genome range (thread_num - 1 worker
+// threads plus the calling thread), then write all match records to
+// output_simple_filename and free the record buffers.
+void VCF::DirectSearchMultiThread() {
+
+    // one record buffer per range
+    direct_match_records = new vector<string>* [thread_num];
+    for (int slot = 0; slot < thread_num; slot++) {
+        direct_match_records[slot] = new vector<string>;
+    }
+
+    vector<thread> workers;
+    //spawn threads
+    int i = 0;
+    while (i < thread_num - 1) {
+        workers.push_back( thread(&VCF::DirectSearchInThread, this, ref(refpos_2_snp[i]), ref(querypos_2_snp[i]), i));
+        i++;
+    }
+    // the last range is processed in the calling thread
+    // i equals to (thread_num - 1)
+    if (i != thread_num - 1) {
+        dout << "[Error] thread number not match" << endl;
+    }
+    DirectSearchInThread(refpos_2_snp[i], querypos_2_snp[i],i);
+
+    // wait for every worker before reading the records
+    for (auto & worker : workers) {
+        worker.join();
+    }
+    workers.clear();
+
+    ofstream output_simple_file;
+    output_simple_file.open(output_simple_filename);
+    output_simple_file << "##VCF1:" << ref_vcf_filename << endl;
+    output_simple_file << "##VCF2:" << que_vcf_filename << endl;
+    output_simple_file << "#CHROM\tPOS\tREF\tALT" << endl;
+    for (int slot = 0; slot < thread_num; slot++) {
+        const vector<string> & records = *direct_match_records[slot];
+        for (size_t r = 0; r < records.size(); r++) {
+            output_simple_file << records.at(r) << endl;
+        }
+    }
+    output_simple_file.close();
+
+    for (int slot = 0; slot < thread_num; slot++) {
+        delete direct_match_records[slot];
+    }
+    delete [] direct_match_records;
+
+}
+
+// protected
+// Apply one SNP to `sequence` and return the upper-cased result.
+//   sequence  reference window
+//   s         variant; s.pos is a genome position
+//   offset    genome position of sequence[0]
+// When the variant does not fit inside the window, the window is returned
+// unmodified (upper-cased), as ModifySequenceBySnpList does.
+string VCF::ModifySequenceBySnp(const string sequence, SNP s, int offset) {
+    string result = "";
+    int snp_pos = s.pos - offset;
+    int snp_end = snp_pos + (int)s.ref.length();
+    if (snp_pos < 0 || snp_end > (int)sequence.length()) {
+        if (snp_pos < 0) {
+            dout << "[Error] snp start before sequence start" << endl;
+        }
+        else {
+            dout << "[Error] snp end greater than sequence length" << endl;
+        }
+        // substr() would throw or return garbage here; leave the window as is.
+        result = sequence;
+    }
+    else {
+        result += sequence.substr(0, snp_pos);
+        result += s.alt;
+        result += sequence.substr(snp_end, sequence.length() - snp_end);
+    }
+    transform(result.begin(), result.end(), result.begin(), ::toupper);
+    return result;
+}
+
+// protected
+// Apply a list of SNPs to `sequence` and return the upper-cased result.
+//   sequence  reference window
+//   s         variants (taken by value, sorted here by position)
+//   offset    genome position of sequence[0]
+// Variants are applied from the rightmost to the leftmost so that earlier
+// coordinates stay valid. If any variant does not fit into the current
+// result, the unmodified window is returned (upper-cased).
+string VCF::ModifySequenceBySnpList(const string sequence, vector<SNP> s, int offset) {
+    string result = sequence;
+    if(s.size() == 1){
+        return ModifySequenceBySnp(sequence, s[0], offset);
+    }
+    sort(s.begin(), s.end());
+    for (int i = (int)s.size()-1; i >= 0; i--) {
+        int snp_pos = s[i].pos - offset;
+        int snp_end = snp_pos + (int)s[i].ref.length();
+        int result_length = (int)result.length();
+        // a negative start would be turned into a huge length by substr()
+        if(snp_pos < 0 || snp_pos > result_length || snp_end > result_length){
+            result = sequence;
+            transform(result.begin(), result.end(), result.begin(), ::toupper);
+            return result;
+        }
+        result = result.substr(0, snp_pos) + s[i].alt + result.substr(snp_end, result_length-snp_end);
+    }
+    transform(result.begin(), result.end(), result.begin(), ::toupper);
+    return result;
+}
+
+// protected
+// Returns true when, walking the list in the given order, a variant starts
+// before the furthest reference end seen so far. Lists with at most one
+// variant never overlap.
+bool VCF::CheckVariantOverlap(vector<SNP> snp_list){
+    if (snp_list.size() <= 1) return false;
+    int covered_until = -1;
+    for (const SNP & snp : snp_list) {
+        if (snp.pos < covered_until) return true;
+        const int snp_end = snp.pos + (int)snp.ref.length();
+        if (covered_until < snp_end) {
+            covered_until = snp_end;
+        }
+    }
+    return false;
+}
+
+// Sleeps for two seconds, then prints "Hello World".
+void f(){
+    const chrono::seconds pause(2);
+    this_thread::sleep_for(pause);
+    cout << "Hello World" << endl;
+}
+
+// protected
+// Returns true when `sequence` (compared case-insensitively) consists of at
+// least two consecutive copies of some prefix, ignoring a trailing remainder
+// shorter than that prefix. A one-character sequence counts as a repeat.
+// unit_threshold does not influence the result.
+bool VCF::CheckTandemRepeat(string sequence, int unit_threshold) {
+    (void)unit_threshold;
+    const int total = (int)sequence.length();
+    if (total == 1) return true;
+    transform(sequence.begin(), sequence.end(), sequence.begin(), ::toupper);
+
+    const int longest_unit = total / 2 + 1;
+    for (int unit = 1; unit <= longest_unit; unit++) {
+        int copies = 1;
+        bool mismatch = false;
+        // compare every full-length chunk against the leading unit
+        for (int cursor = unit; cursor + unit <= total; cursor += unit) {
+            if (sequence.compare(cursor, unit, sequence, 0, unit) != 0) {
+                mismatch = true;
+                break;
+            }
+            copies++;
+        }
+        if (!mismatch && copies > 1) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/*
+ clustering snps
+ algorithm description, please refer to paper method
+*/
+// protected
+// Collect all reference and query SNPs into data_list, sort them by position
+// and assign each one to a cluster in cluster_snps_map.
+void VCF::ClusteringSnps() {
+ // handle heterozygous snps
+ // reference variants are tagged with flag 1 ...
+ for (int i = 0; i < refpos_2_snp.size(); i++) {
+ auto & m = refpos_2_snp[i];
+ for (auto it = m.begin(); it != m.end(); ++it) {
+ auto & v = it->second;
+ for (int k = 0; k < v.size(); k++) {
+ if (v[k].flag != 1) {
+ v[k].flag = 1;
+ }
+ data_list.push_back(v[k]);
+ }
+ }
+ }
+ // ... and query variants with flag -1
+ for (int i = 0; i < querypos_2_snp.size(); i++) {
+ auto & m = querypos_2_snp[i];
+ for (auto it = m.begin(); it != m.end(); ++it) {
+ auto & v = it->second;
+ for (int k = 0; k < v.size(); k++) {
+ v[k].flag = -1;
+ data_list.push_back(v[k]);
+ }
+ }
+ }
+
+ if (data_list.size() == 0)
+ return;
+
+ // operator< on SNP compares positions only
+ sort(data_list.begin(), data_list.end());
+
+ int cluster_index = 0;
+ // accumulated inserted / deleted lengths of the current cluster,
+ // separately for reference (ref) and query (que) variants
+ int ins_ref = 0;
+ int del_ref = 0;
+ int ins_que = 0;
+ int del_que = 0;
+ // c_start: furthest reference end of the variants seen in the current cluster
+ // c_end: start of the variant currently examined
+ int c_start = 0;
+ int c_end = 0;
+
+ for (int i = 0; i < data_list.size(); i++) {
+ auto snp = data_list[i];
+ // check if need to separator clusters
+ if (i > 0) {
+ c_end = snp.pos;
+ if(c_end-c_start >= 2){
+ // genome bases between the current cluster and this variant
+ string separator = genome_sequence.substr(c_start, c_end - c_start);
+ int max_change = max(ins_ref + del_que, ins_que + del_ref);
+ // start a new cluster when the gap is longer than twice the possible
+ // length change and is either longer than MAX_REPEAT_LEN or not a tandem repeat
+ if ((int)(separator.length()) > 2 * max_change &&
+ ((int)(separator.length()) > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, max_change)))
+ {
+ cluster_index++;
+ ins_ref = 0;
+ del_ref = 0;
+ ins_que = 0;
+ del_que = 0;
+ c_start = 0; // re-assign c_start
+ }
+ }
+ }
+ // extend the cluster end to the reference end of this variant
+ if(c_start < snp.pos + (int)(snp.ref.length())) c_start = snp.pos + (int)(snp.ref.length());
+
+ // assign snp to cluster
+ cluster_snps_map[cluster_index].push_back(snp);
+ int ref_length = (int)(snp.ref.length());
+ int alt_length = (int)(snp.alt.length());
+ int diff_length = abs(ref_length - alt_length);
+ // update the length-change tallies; snp_type 'I' = insertion, 'D' = deletion
+ if (snp.flag == 1) {
+ if (snp.snp_type == 'I') {
+ ins_ref += diff_length;
+ }
+ else if (snp.snp_type == 'D') {
+ del_ref += diff_length;
+ }
+ }
+ else {
+ if (snp.snp_type == 'I') {
+ ins_que += diff_length;
+ }
+ else if (snp.snp_type == 'D') {
+ del_que += diff_length;
+ }
+ }
+ }
+}
+
+// protected
+// Returns true when some non-overlapping subset of the reference variants and
+// some non-overlapping subset of the query variants turn `subsequence` into
+// the same sequence. mixed_list is sorted as a side effect; nothing is removed
+// from the lists and thread_index is not used.
+bool VCF::MatchSnpListsWithWeight(vector<SNP> & ref_snp_list,
+    vector<SNP> & query_snp_list,
+    vector<SNP> & mixed_list,
+    const string subsequence,
+    int offset,
+    int thread_index)
+{
+    sort(mixed_list.begin(), mixed_list.end());
+
+    // every sequence reachable from the reference variants, largest subsets first
+    map<string, vector<SNP> > ref_haplotypes;
+    for (int subset_size = ref_snp_list.size(); subset_size >= 1; subset_size--) {
+        vector<vector<SNP> > subsets = CreateCombinations(ref_snp_list, subset_size);
+        for (size_t k = 0; k < subsets.size(); k++) {
+            vector<SNP> subset = subsets[k];
+            if (CheckVariantOverlap(subset)) continue;
+            string ref_sequence = ModifySequenceBySnpList(subsequence, subset, offset);
+            ref_haplotypes[ref_sequence] = subset;
+        }
+    }
+
+    // stop at the first query subset that reproduces one of those sequences
+    for (int subset_size = query_snp_list.size(); subset_size >= 1; subset_size--) {
+        vector<vector<SNP> > subsets = CreateCombinations(query_snp_list, subset_size);
+        for (size_t k = 0; k < subsets.size(); k++) {
+            vector<SNP> subset = subsets[k];
+            if (CheckVariantOverlap(subset)) continue;
+            string que_sequence = ModifySequenceBySnpList(subsequence, subset, offset);
+            if (ref_haplotypes.count(que_sequence) > 0) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+// protected
+// Find one pair of variant subsets (reference / query) that turn
+// `subsequence` into the same sequence, record it and remove the matched
+// variants from all three lists.
+//   ref_snp_list / query_snp_list  candidate variants of one cluster
+//   mixed_list                     list holding both kinds (sorted here)
+//   subsequence                    reference window, offset = genome position of its first base
+//   thread_index                   slot of complex_match_records receiving the record
+// Returns true when a match was recorded, false when none exists or the
+// merged variant cannot be normalized.
+// NOTE(review): the record is "chrom<TAB>ref<TAB>alt<TAB>VCF1 variants<TAB>VCF2 variants"
+// and carries no position, although the header written by
+// ClusteringSearchMultiThread lists a POS column - confirm the intended format.
+bool VCF::MatchSnpLists(vector<SNP> & ref_snp_list,
+    vector<SNP> & query_snp_list,
+    vector<SNP> & mixed_list,
+    const string subsequence,
+    int offset,
+    int thread_index)
+{
+    // handle heterozygous snps
+    map<string, vector<SNP> > ref_choice_snps;
+    sort(mixed_list.begin(), mixed_list.end());
+
+    // identity of a variant inside the candidate lists
+    auto same_variant = [](const SNP & a, const SNP & b) {
+        return a.pos == b.pos && a.ref == b.ref && a.alt == b.alt && a.flag == b.flag;
+    };
+
+    // sequences reachable from the reference variants, largest subsets first
+    for (int i = ref_snp_list.size(); i >= 1; i--) {
+        vector<vector<SNP> > combinations = CreateCombinations(ref_snp_list, i);
+        for (int k = 0; k < (int)combinations.size(); k++) {
+            auto c = combinations[k];
+            if(CheckVariantOverlap(c)) continue;
+            string ref_sequence = ModifySequenceBySnpList(subsequence, c, offset);
+            ref_choice_snps[ref_sequence] = c;
+        }
+    }
+    for (int i = query_snp_list.size(); i >= 1; i--) {
+        vector<vector<SNP> > combinations = CreateCombinations(query_snp_list, i);
+        for (int k = 0; k < (int)combinations.size(); k++) {
+            auto c = combinations[k];
+            if(CheckVariantOverlap(c)) continue;
+            string que_sequence = ModifySequenceBySnpList(subsequence, c, offset);
+            if (ref_choice_snps.find(que_sequence) == ref_choice_snps.end()) continue;
+
+            // delete all matched
+            auto r = ref_choice_snps[que_sequence];
+            sort(r.begin(), r.end());
+            string matching_result = "";
+            matching_result += chromosome_name;
+
+            string parsimonious_ref = subsequence;
+            string parsimonious_alt = que_sequence;
+            if(parsimonious_ref == parsimonious_alt){
+                dout << "[Error] in variant, ref == alt";
+            }
+            // normalize
+            int left_index = offset;
+            if (toupper(genome_sequence[left_index]) != toupper(subsequence[0])) {
+                dout << "[Error] genome sequence, subsequence, offset does not match." << endl;
+            }
+            bool change_in_allels = true;
+            while (change_in_allels) {
+                change_in_allels = false;
+                if (toupper(parsimonious_ref.back()) == toupper(parsimonious_alt.back())) {
+                    // both alleles must keep a base unless the genome can extend them to the left
+                    if ((parsimonious_ref.size() > 1 && parsimonious_alt.size() > 1) || left_index > 0) {
+                        parsimonious_ref.pop_back();
+                        parsimonious_alt.pop_back();
+                        change_in_allels = true;
+                    }
+                    else {
+                        return false;
+                    }
+                }
+                if (parsimonious_ref.length() == 0 || parsimonious_alt.length() == 0) {
+                    left_index--;
+                    char left_char = genome_sequence[left_index];
+                    parsimonious_ref = left_char + parsimonious_ref;
+                    parsimonious_alt = left_char + parsimonious_alt;
+                }
+            }
+            while (toupper(parsimonious_ref[0]) == toupper(parsimonious_alt[0]) && parsimonious_ref.size() > 1 && parsimonious_alt.size() > 1) {
+                parsimonious_ref.erase(0, 1);
+                parsimonious_alt.erase(0, 1);
+            }
+
+            matching_result += "\t" + parsimonious_ref + "\t" + parsimonious_alt;
+
+            // remove the matched reference variants and list them in the record
+            string ref_matching_variants = "";
+            for (int m = 0; m < (int)r.size(); m++) {
+                SNP r_snp = r[m];
+                for (auto n = mixed_list.begin(); n != mixed_list.end(); n++) {
+                    if (same_variant(*n, r_snp)) {
+                        mixed_list.erase(n);
+                        break;
+                    }
+                }
+                for (auto n = ref_snp_list.begin(); n != ref_snp_list.end(); n++){
+                    SNP m_snp = *n;
+                    if (same_variant(m_snp, r_snp)) {
+                        // 1-based
+                        ref_matching_variants += to_string(m_snp.pos+1) + "," + m_snp.ref + "," + m_snp.alt + ";";
+                        ref_snp_list.erase(n);
+                        break;
+                    }
+                }
+            }
+            matching_result += "\t" + ref_matching_variants;
+
+            // same for the matched query variants
+            string que_matching_variants = "";
+            sort(c.begin(), c.end());
+            for (int m = 0; m < (int)c.size(); m++) {
+                SNP q_snp = c[m];
+                for (auto n = mixed_list.begin(); n != mixed_list.end(); n++) {
+                    if (same_variant(*n, q_snp)) {
+                        mixed_list.erase(n);
+                        break;
+                    }
+                }
+                for (auto n = query_snp_list.begin(); n != query_snp_list.end(); n++){
+                    SNP m_snp = *n;
+                    if (same_variant(m_snp, q_snp)) {
+                        // 1-based
+                        que_matching_variants += to_string(m_snp.pos+1) + "," + m_snp.ref + "," + m_snp.alt + ";";
+                        query_snp_list.erase(n);
+                        break;
+                    }
+                }
+            }
+            matching_result += "\t" + que_matching_variants + "\n";
+            complex_match_records[thread_index]->push_back(matching_result);
+            return true;
+        }
+    }
+    return false;
+}
+
+// private
+// Try to match reference and query variants inside every cluster whose id is
+// in [start, end); matches are recorded by MatchSnpLists under thread_index.
+// Processing stops at the first id that is missing from cluster_snps_map.
+void VCF::ClusteringSearchInThread(int start, int end, int thread_index) {
+ for (int cluster_id = start; cluster_id < end; cluster_id++) {
+ if (cluster_snps_map.find(cluster_id) != cluster_snps_map.end()) {
+ auto & snp_list = cluster_snps_map[cluster_id];
+ vector<SNP> candidate_ref_snps;
+ vector<SNP> candidate_que_snps;
+ vector<SNP> candidate_snps;
+ int min_pos = std::numeric_limits<int>::max();
+ int max_pos = 0;
+ // split the cluster by origin (flag 1 = reference, -1 = query) and
+ // find the genome span covered by its variants
+ for (int i = 0; i < snp_list.size(); i++) {
+ auto s = snp_list[i];
+ if (s.flag == 1) {
+ candidate_ref_snps.push_back(s);
+ }
+ else if(s.flag == -1) {
+ candidate_que_snps.push_back(s);
+ }
+ candidate_snps.push_back(s);
+ if (min_pos > s.pos) min_pos = s.pos;
+ if (max_pos < s.pos + (int)(s.ref.length())) max_pos = s.pos + (int)(s.ref.length());
+ }
+
+ // widen the window by one base on each side, clamped to the genome
+ min_pos = max(0, min_pos - 1);
+ max_pos = min(max_pos + 1, (int)genome_sequence.length());
+ string subsequence = genome_sequence.substr(min_pos, max_pos-min_pos);
+
+ // nothing to match unless both sides are present and at least one side has several variants
+ if (candidate_ref_snps.size() == 0 || candidate_que_snps.size() == 0) continue;
+ if (candidate_ref_snps.size() <= 1 && candidate_que_snps.size() <= 1) continue;
+ // clusters with more than 10 variants on one side are split into sub-clusters first
+ if(candidate_ref_snps.size() > 10 || candidate_que_snps.size() > 10){
+ vector<SNP> cluster_ref_snps;
+ vector<SNP> cluster_que_snps;
+ int ins_ref = 0;
+ int del_ref = 0;
+ int ins_que = 0;
+ int del_que = 0;
+ int c_start = std::numeric_limits<int>::max();
+ int c_end = std::numeric_limits<int>::max();
+ // temporarily move each position to the variant's reference end so that
+ // the sort below orders the copies by end position
+ for(int i = 0; i < candidate_snps.size(); i++){
+ candidate_snps[i].pos += (int)candidate_snps[i].ref.length();
+ }
+
+ sort(candidate_snps.begin(), candidate_snps.end());
+
+ // walk from the largest end position down to the smallest
+ for (int i = candidate_snps.size()-1; i >= 0; i--) {
+ auto snp = candidate_snps[i];
+ // check if need to separator clusters
+ if (i < candidate_snps.size() - 1) {
+ // NOTE: this declaration shadows the outer c_start
+ int c_start = snp.pos;
+ if(c_start < c_end){
+ // genome bases between this variant's end and the smallest start seen so far
+ string separator = genome_sequence.substr(c_start, c_end - c_start);
+ int max_change = max(ins_ref + del_que, ins_que + del_ref);
+ if ((int)separator.length() > 2 * max_change && !CheckTandemRepeat(separator, max_change))
+ {
+ // empty-bodied loop: keep matching the finished sub-cluster until
+ // MatchSnpLists fails or one side is exhausted
+ while(cluster_ref_snps.size() > 0 &&
+ cluster_que_snps.size() > 0 &&
+ MatchSnpLists(cluster_ref_snps, cluster_que_snps, snp_list, subsequence, min_pos, thread_index));
+ cluster_ref_snps.clear();
+ cluster_que_snps.clear();
+ ins_ref = 0;
+ del_ref = 0;
+ ins_que = 0;
+ del_que = 0;
+ }
+ }
+ }
+
+ if(c_end > snp.pos- (int)snp.ref.length()) c_end = snp.pos - (int)snp.ref.length();
+ // assign snp to cluster
+ // restore the original start position of the copy before storing it
+ snp.pos -= (int)snp.ref.length();
+ if(snp.flag == 1){
+ cluster_ref_snps.push_back(snp);
+ }else{
+ cluster_que_snps.push_back(snp);
+ }
+ int ref_length = (int)snp.ref.length();
+ int alt_length = (int)snp.alt.length();
+ int diff_length = abs(ref_length - alt_length);
+ if (snp.flag == 1) {
+ if (snp.snp_type == 'I') {
+ ins_ref += diff_length;
+ }
+ else if (snp.snp_type == 'D') {
+ del_ref += diff_length;
+ }
+ }
+ else {
+ if (snp.snp_type == 'I') {
+ ins_que += diff_length;
+ }
+ else if (snp.snp_type == 'D') {
+ del_que += diff_length;
+ }
+ }
+ }
+
+ //if separating cluster does not work, try heuristic, if still not work, discard this cluster
+ // NOTE(review): every path through this block ends in `continue` (either
+ // via skip_flag or after the warning below), so a last sub-cluster with
+ // more than 20 variants on one side is never matched.
+ if(cluster_ref_snps.size() > 20 || cluster_que_snps.size() > 20){
+ // final check by variant length, if not applicable, skip it and give a warning.
+ if (cluster_ref_snps.size() > cluster_que_snps.size()){
+
+ // total deleted / inserted length over the reference variants
+ int ref_sum_del_len = 0;
+ int ref_sum_ins_len = 0;
+ for(int j = 0; j < cluster_ref_snps.size(); j++){
+ int len_change = cluster_ref_snps[j].ref.size() - cluster_ref_snps[j].alt.size();
+ if (len_change > 0){
+ ref_sum_del_len += len_change;
+ }else if(len_change < 0){
+ ref_sum_ins_len -= len_change;
+ }
+ }
+ // skip when a single query variant changes more length than all reference variants together
+ bool skip_flag = false;
+ for(int j = 0; j < cluster_que_snps.size(); j++){
+ int len_change = cluster_que_snps[j].ref.size() - cluster_que_snps[j].alt.size();
+ if(len_change > 0){
+ if (ref_sum_del_len < len_change){
+ skip_flag = true;
+ break;
+ }
+ }else if(len_change < 0){
+ if (ref_sum_ins_len < len_change * -1){
+ skip_flag = true;
+ break;
+ }
+ }
+ }
+ if (skip_flag) continue;
+ }else{
+ // mirrored check with the roles of reference and query swapped
+ int que_sum_del_len = 0;
+ int que_sum_ins_len = 0;
+ for(int j = 0; j < cluster_que_snps.size(); j++){
+ int len_change = cluster_que_snps[j].ref.size() - cluster_que_snps[j].alt.size();
+ if (len_change > 0){
+ que_sum_del_len += len_change;
+ }else if(len_change < 0){
+ que_sum_ins_len -= len_change;
+ }
+ }
+ bool skip_flag = false;
+ for(int j = 0; j < cluster_ref_snps.size(); j++){
+ int len_change = cluster_ref_snps[j].ref.size() - cluster_ref_snps[j].alt.size();
+ if(len_change > 0){
+ if (que_sum_del_len < len_change){
+ skip_flag = true;
+ break;
+ }
+ }else if(len_change < 0){
+ if (que_sum_ins_len < len_change * -1){
+ skip_flag = true;
+ break;
+ }
+ }
+ }
+ if(skip_flag) continue;
+
+ }
+ cout << "[Warning] large cluster found, skip it." << endl;
+ continue;
+ }
+
+ // match whatever is left in the last sub-cluster
+ while(cluster_ref_snps.size() > 0 &&
+ cluster_que_snps.size() > 0 &&
+ MatchSnpLists(cluster_ref_snps, cluster_que_snps, snp_list, subsequence, min_pos, thread_index));
+
+ }
+ else
+ {
+ // small cluster: match it as a whole, repeating until no further match is found
+ while(candidate_ref_snps.size() > 0 &&
+ candidate_que_snps.size() > 0 &&
+ MatchSnpLists(candidate_ref_snps, candidate_que_snps, snp_list, subsequence, min_pos, thread_index));
+ }
+ }
+ else {
+ // cluster id not present in the map: stop this range
+ break;
+ }
+ }
+}
+
+// match by cluster
+// private
+// Distribute the clusters of cluster_snps_map over thread_num ranges, run
+// ClusteringSearchInThread on each (thread_num - 1 worker threads plus the
+// calling thread), then write all non-blank match records to
+// output_complex_filename and free the record buffers.
+// With no clusters only the three header lines are written.
+void VCF::ClusteringSearchMultiThread() {
+    clustering_search = true;
+
+    //initialize vector size, each allocating will have a lock
+    complex_match_records = new vector<string>* [thread_num];
+    for(int j = 0; j < thread_num; j++){
+        complex_match_records[j] = new vector<string>;
+    }
+
+    // begin() must not be dereferenced on an empty map
+    if (!cluster_snps_map.empty()) {
+        int start = cluster_snps_map.begin()->first;
+        int cluster_number = cluster_snps_map.size();
+        // clusters per range, rounded up
+        int cluster_step = cluster_number / thread_num;
+        if (cluster_step * thread_num < cluster_number) cluster_step++;
+        int end = start + cluster_step;
+
+        vector<thread> threads;
+        //spawn threads
+        int i = 0;
+        for (; i < thread_num - 1; i++) {
+            threads.push_back(thread(&VCF::ClusteringSearchInThread, this, start, end, i));
+            start = end;
+            end = start + cluster_step;
+        }
+        // the last range is processed in the calling thread
+        // i equals to (thread_num - 1)
+        if (i != thread_num - 1) {
+            dout << "[Error] thread number not match" << endl;
+        }
+        if (start >= (int)cluster_snps_map.size()) {
+            dout << "[Error] index out of map range" << endl;
+        }
+        else {
+            ClusteringSearchInThread(start, end, i);
+        }
+
+        // wait for every worker before reading the records
+        std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
+    }
+
+    ofstream output_complex_file;
+    output_complex_file.open(output_complex_filename);
+    output_complex_file << "##VCF1:" << ref_vcf_filename << endl;
+    output_complex_file << "##VCF2:" << que_vcf_filename << endl;
+    output_complex_file << "#CHROM\tPOS\tREF\tALT\tVCF1\tVCF2" << endl;
+    for(int i = 0; i < thread_num; i++){
+        for (int j = 0; j < (int)complex_match_records[i]->size(); j++){
+            // records already end with '\n'; skip records made of spaces only
+            if(complex_match_records[i]->at(j).find_first_not_of(' ') != std::string::npos){
+                output_complex_file << complex_match_records[i]->at(j);
+            }
+        }
+    }
+    output_complex_file.close();
+
+    for(int j = 0; j < thread_num; j++){
+        delete complex_match_records[j];
+    }
+    delete [] complex_match_records;
+}
+
+// Count the reference variants currently stored.
+// indel_num (output) receives how many of the counted variants have REF and
+// ALT strings of different length. Returns the total count.
+// private
+int VCF::GetRefSnpNumber(int & indel_num) {
+    int result = 0;
+    indel_num = 0;
+    if (clustering_search) {
+        // Only entries with flag == 1 are counted in the clustered data.
+        for (const auto & cluster : cluster_snps_map) {
+            // iterate by reference: copying each SNP vector is needless work
+            for (const SNP & snp : cluster.second) {
+                if (snp.flag != 1) continue;
+                result++;
+                if (snp.ref.length() != snp.alt.length())
+                    indel_num++;
+            }
+        }
+    } else {
+        for (const auto & pos_2_snps : refpos_2_snp) {
+            for (const auto & entry : pos_2_snps) {
+                const vector<SNP> & snps = entry.second;
+                result += snps.size();
+                for (const SNP & snp : snps) {
+                    if (snp.ref.length() != snp.alt.length())
+                        indel_num++;
+                }
+            }
+        }
+    }
+    return result;
+}
+
+// Count the query variants currently stored.
+// indel_num (output) receives how many of the counted variants have REF and
+// ALT strings of different length. Returns the total count.
+// private
+int VCF::GetQuerySnpNumber(int & indel_num) {
+    int result = 0;
+    indel_num = 0;
+    if (clustering_search) {
+        // Only entries with flag == -1 are counted in the clustered data.
+        for (const auto & cluster : cluster_snps_map) {
+            // iterate by reference: copying each SNP vector is needless work
+            for (const SNP & snp : cluster.second) {
+                if (snp.flag != -1) continue;
+                result++;
+                if (snp.ref.length() != snp.alt.length())
+                    indel_num++;
+            }
+        }
+    } else {
+        for (const auto & pos_2_snps : querypos_2_snp) {
+            for (const auto & entry : pos_2_snps) {
+                const vector<SNP> & snps = entry.second;
+                result += snps.size();
+                for (const SNP & snp : snps) {
+                    if (snp.ref.length() != snp.alt.length())
+                        indel_num++;
+                }
+            }
+        }
+    }
+    return result;
+}
+
+// Compare a reference VCF against a query VCF over one genome sequence.
+//   ref_vcf / query_vcf : paths of the two VCF files
+//   genome_seq          : path of the genome sequence file
+//   direct_search       : when true, stop after the direct search stage
+//   output_prefix       : prefix of the .stat / .simple / .complex outputs
+//   match_genotype, normalization : stored on the object for later stages
+// Counts are logged through dout and written, one number per line, to
+// <output_prefix>.stat (totals first, then the indel-only counterparts).
+// public
+void VCF::Compare(string ref_vcf,
+        string query_vcf,
+        string genome_seq,
+        bool direct_search,
+        string output_prefix,
+        bool match_genotype,
+        bool normalization){
+
+    ref_vcf_filename = ref_vcf;
+    que_vcf_filename = query_vcf;
+    this->match_genotype = match_genotype;
+    this->normalization = normalization;
+    output_stat_filename = output_prefix + ".stat";
+    output_simple_filename = output_prefix + ".simple";
+    output_complex_filename = output_prefix + ".complex";
+
+    //------------read genome sequence and decide boundary according to thread number
+    dsptime();
+    dout << " Read genome sequence file... " << endl;
+    ReadGenomeSequence(genome_seq);
+    dsptime();
+    dout << " Finish reading genome sequence file." << endl;
+    //------------read ref and query vcf file
+    dsptime();
+    dout << " Read reference vcf file... " << endl;
+    ReadRefVCF(ref_vcf);
+    dsptime();
+    dout << " Read query vcf file... " << endl;
+    ReadQueryVCF(query_vcf);
+    dsptime();
+    dout << " Finish reading all vcf file." << endl;
+
+    //------------check vcf entry number before matching
+    int ref_total_indel_num, que_total_indel_num;
+    int ref_total_num = GetRefSnpNumber(ref_total_indel_num);
+    int que_total_num = GetQuerySnpNumber(que_total_indel_num);
+    dout << " referece vcf entry number [total, indel]: " << ref_total_num << "," << ref_total_indel_num << endl;
+    dout << " query vcf entry number: [total, indel] " << que_total_num << "," << que_total_indel_num << endl;
+
+    //------------direct search
+    dsptime();
+    dout << " Direct search ... " << endl;
+    DirectSearchMultiThread();
+    dsptime();
+    dout << " Finish direct search." << endl;
+    // matched = counted before the stage - still counted after the stage
+    int ref_direct_left_indel_num, que_direct_left_indel_num;
+    int ref_direct_left_num = GetRefSnpNumber(ref_direct_left_indel_num);
+    int que_direct_left_num = GetQuerySnpNumber(que_direct_left_indel_num);
+    int ref_direct_match_num = ref_total_num - ref_direct_left_num;
+    int que_direct_match_num = que_total_num - que_direct_left_num;
+    int ref_direct_match_indel_num = ref_total_indel_num - ref_direct_left_indel_num;
+    int que_direct_match_indel_num = que_total_indel_num - que_direct_left_indel_num;
+    dout << " referece vcf entry direct match number [total, indel]: " << ref_direct_match_num << "," << ref_direct_match_indel_num << endl;
+    dout << " query vcf entry direct match number [total, indel]: " << que_direct_match_num << "," << que_direct_match_indel_num << endl;
+
+    if (direct_search){
+        dout << " referece vcf entry mismatch number [total, indel]: " << ref_direct_left_num << "," << ref_direct_left_indel_num << endl;
+        dout << " query vcf entry mismatch number [total, indel]: " << que_direct_left_num << "," << que_direct_left_indel_num << endl;
+        ofstream output_stat_file;
+        output_stat_file.open(output_stat_filename);
+        output_stat_file << ref_total_num << endl;
+        output_stat_file << que_total_num << endl;
+        output_stat_file << ref_direct_match_num << endl;
+        output_stat_file << que_direct_match_num << endl;
+        output_stat_file << ref_direct_left_num << endl;
+        output_stat_file << que_direct_left_num << endl;
+        //=====================================================
+        output_stat_file << ref_total_indel_num << endl;
+        output_stat_file << que_total_indel_num << endl;
+        output_stat_file << ref_direct_match_indel_num << endl;
+        output_stat_file << que_direct_match_indel_num << endl;
+        output_stat_file << ref_direct_left_indel_num << endl;
+        output_stat_file << que_direct_left_indel_num << endl;
+        output_stat_file.close();
+
+        return;
+    }
+
+    //-------------clustering search
+    dsptime();
+    dout << " Clustering snps ... " << endl;
+    ClusteringSnps();
+    dsptime();
+    dout << " Finish clustering." << endl;
+    dsptime();
+    dout << " Clustering search ... " << endl;
+    ClusteringSearchMultiThread();
+    dsptime();
+    dout << " Finish clustering search." << endl;
+    int ref_cluster_left_indel_num, que_cluster_left_indel_num;
+    int ref_cluster_left_num = GetRefSnpNumber(ref_cluster_left_indel_num);
+    int que_cluster_left_num = GetQuerySnpNumber(que_cluster_left_indel_num);
+    int ref_cluster_match_num = ref_direct_left_num - ref_cluster_left_num;
+    int que_cluster_match_num = que_direct_left_num - que_cluster_left_num;
+    int ref_cluster_match_indel_num = ref_direct_left_indel_num - ref_cluster_left_indel_num;
+    int que_cluster_match_indel_num = que_direct_left_indel_num - que_cluster_left_indel_num;
+
+    dout << " referece vcf entry cluster match number [total, indel]: " << ref_cluster_match_num << "," << ref_cluster_match_indel_num << endl;
+    dout << " query vcf entry cluster match number [total, indel]: " << que_cluster_match_num << "," << que_cluster_match_indel_num << endl;
+
+    dout << " referece vcf entry mismatch number [total, indel]: " << ref_cluster_left_num << "," << ref_cluster_left_indel_num << endl;
+    dout << " query vcf entry mismatch number [total, indel]: " << que_cluster_left_num << "," << que_cluster_left_indel_num << endl;
+
+    //write stat file
+    ofstream output_stat_file;
+    output_stat_file.open(output_stat_filename);
+    output_stat_file << ref_total_num << endl;
+    output_stat_file << que_total_num << endl;
+    output_stat_file << ref_direct_match_num << endl;
+    output_stat_file << que_direct_match_num << endl;
+    output_stat_file << ref_cluster_match_num << endl;
+    output_stat_file << que_cluster_match_num << endl;
+    output_stat_file << ref_cluster_left_num << endl;
+    output_stat_file << que_cluster_left_num << endl;
+    //=====================================================
+    output_stat_file << ref_total_indel_num << endl;
+    output_stat_file << que_total_indel_num << endl;
+    output_stat_file << ref_direct_match_indel_num << endl;
+    output_stat_file << que_direct_match_indel_num << endl;
+    output_stat_file << ref_cluster_match_indel_num << endl;
+    output_stat_file << que_cluster_match_indel_num << endl;
+    // Final mismatches are what is left after clustering, mirroring the
+    // non-indel section above (it used to repeat the post-direct-search counts).
+    output_stat_file << ref_cluster_left_indel_num << endl;
+    output_stat_file << que_cluster_left_indel_num << endl;
+    output_stat_file.close();
+}
diff --git a/src/vcf.h b/src/vcf.h
new file mode 100644
index 0000000..77d2754
--- /dev/null
+++ b/src/vcf.h
@@ -0,0 +1,210 @@
+#pragma once // the same purpose as #include guards
+
+#include <unordered_map>
+#include <algorithm>
+#include <vector>
+#include <map>
+#include <list>
+#include <tuple>
+#include <chrono>
+#include <cstdint>
+#include <limits>
+#include <thread>
+#include <set>
+#include "util.h"
+
+#include <iostream>
+using namespace std;
+
+
+// One VCF variant record as used by the matching code.
+struct SNP {
+    int pos;
+    char snp_type;
+    string ref;
+    string alt;
+    string genotype;
+    int flag;
+
+    // Every argument is optional; the defaults give an 'S'-typed variant at
+    // position 0 with empty alleles, genotype "1/1" and flag 1.
+    SNP(int pos_ = 0,
+        char snp_type_ = 'S',
+        string ref_ = "",
+        string alt_ = "",
+        string genotype_ = "1/1",
+        int flag_ = 1)
+        : pos(pos_)
+        , snp_type(snp_type_)
+        , ref(ref_)
+        , alt(alt_)
+        , genotype(genotype_)
+        , flag(flag_)
+    {}
+};
+
+// SNP comparison operators are declared as free functions rather than members,
+// the idiomatic form for lexicographical comparison of structures.
+bool operator <(const SNP& x, const SNP& y);
+bool operator ==(const SNP& x, const SNP& y);
+
+// One int -> SNP-list table per vector slot: hashed (SnpHash) or ordered (SnpMap).
+using SnpHash = vector<unordered_map<int, vector<SNP> > >;
+using SnpMap = vector<map<int, vector<SNP> > >;
+
+// Compares two VCF files over one genome sequence: a direct search first, then
+// an optional clustering search (see Compare). Designed as a base class: it
+// exposes protected state and virtual hooks.
+class VCF
+{
+private:
+    int debug_f;
+
+    bool complex_search;
+
+    void ReadVCF(string filename, SnpHash & pos_2_snps);
+    void DirectSearchInThread(unordered_map<int, vector<SNP> > & ref_snps,
+                              unordered_map<int, vector<SNP> > & query_snps,
+                              int thread_index);
+
+    //template function can only be defined in head file
+    // Return every k-element combination of dict (original order kept) whose
+    // corresponding entries of `changes` sum to `target`.
+    // NOTE(review): assumes changes has at least dict.size() entries -- confirm at call sites.
+    template <typename T>
+    vector<vector<T>> CreateCombinationsWithTarget(vector<T> dict, int k, vector<int> changes, int target) {
+        vector<vector<T>> result;
+        int n = dict.size();
+        // No combination exists for k outside [0, n]; without this guard the
+        // fill() below would run outside the vector.
+        if (k < 0 || k > n) return result;
+        vector<bool> v(n);
+        fill(v.begin(), v.end() - n + k, true); // first k flags set
+
+        do {
+            vector<T> t;
+            int sum = 0;
+            for (int i = 0; i < n; ++i) {
+                if (v[i]){
+                    t.push_back(dict[i]);
+                    sum += changes[i];
+                }
+            }
+            if(sum == target){
+                result.push_back(t);
+            }
+        } while (prev_permutation(v.begin(), v.end()));
+        return result;
+    }
+
+
+    void ClusteringSearchInThread(int start, int end, int thread_index);
+
+    //-------------------------following can be public--------------------------
+    // but for a better OO design, made them private
+    string ref_mismatch_filename;
+    string que_mismatch_filename;
+
+    // data structure for direct search
+    SnpHash refpos_2_snp;
+    SnpHash querypos_2_snp;
+
+    // data structure for complex search
+    SnpMap refpos_snp_map;
+    SnpMap querypos_snp_map;
+
+    void ReadRefVCF(string filename);
+    void ReadQueryVCF(string filename);
+    void DirectSearchMultiThread();
+    virtual void ClusteringSnps();
+    // default value better be in declaration, or definition, but never both
+    void ClusteringSearchMultiThread();
+
+    int GetRefSnpNumber(int & indel_num);
+    int GetQuerySnpNumber(int & indel_num);
+
+    //---------------------------above can be public:---------------------------
+
+protected:
+    vector<int> pos_boundries; // boundaries for splitting into multiple hash tables
+    bool boundries_decided; // VCF files cannot be read before boundaries are decided, since the split is unknown
+    bool normalization;
+    // for inherit
+    bool match_genotype;
+    bool clustering_search;
+
+    int thread_num;
+    string chromosome_name;
+    string genome_sequence; // genome sequence from fasta file
+    const static int MAX_REPEAT_LEN = 1000;
+    const static int GENOTYPE_COLUMN_NUM = 10;
+
+    // data structure for clustering search
+    vector<SNP> data_list;
+    vector<int> cluster_list;
+    map<int, vector<SNP> > cluster_snps_map;
+    vector<unordered_map<int, int> > haplotype_matching_check; // one map per thread
+
+    // storing complex match results
+    //lock free dynamic array of vector pointers for storing.
+    vector<string> ** direct_match_records;
+    vector<string> ** complex_match_records;
+
+    // for output
+    string ref_vcf_filename;
+    string que_vcf_filename;
+    string output_stat_filename;
+    string output_simple_filename;
+    string output_complex_filename;
+
+    bool CompareSnps(SNP r, SNP q);
+    virtual void DecideBoundaries();
+    string ModifySequenceBySnp(const string sequence, SNP s, int offset);
+    string ModifySequenceBySnpList(const string sequence, vector<SNP> s, int offset);
+    bool CheckTandemRepeat(string sequence, int unit_threshold);
+    void ReadGenomeSequence(string filename);
+
+    bool MatchSnpLists(vector<SNP> & ref_snp_list,
+                       vector<SNP> & query_snp_list,
+                       vector<SNP> & mixed_list,
+                       const string subsequence,
+                       int offset,
+                       int thread_index);
+
+    // Return every k-element combination of dict (original order kept).
+    template <typename D>
+    vector<vector<D>> CreateCombinations(vector<D> dict, int k) {
+        vector<vector<D>> result;
+        int n = dict.size();
+        // No combination exists for k outside [0, n]; without this guard the
+        // fill() below would run outside the vector.
+        if (k < 0 || k > n) return result;
+        vector<bool> v(n);
+        fill(v.begin(), v.end() - n + k, true); // first k flags set
+        do {
+            vector<D> t;
+            for (int i = 0; i < n; ++i) {
+                if (v[i]) {
+                    t.push_back(dict[i]);
+                }
+            }
+            result.push_back(t);
+        } while (prev_permutation(v.begin(), v.end()));
+        return result;
+    }
+    bool MatchSnpListsWithWeight(vector<SNP> & ref_snp_list,
+                                 vector<SNP> & query_snp_list,
+                                 vector<SNP> & mixed_list,
+                                 const string subsequence,
+                                 int offset,
+                                 int thread_index);
+    bool CheckVariantOverlap(vector<SNP> snp_list);
+    bool NormalizeSnp(int pos, string ref, string alt, string & parsimonious_ref, string & parsimonious_alt);
+
+    // Upper-case s in place.
+    inline void ToUpper(string & s){
+        transform(s.begin(), s.end(), s.begin(), ::toupper);
+    }
+
+public:
+
+    VCF(int thread_num_ = 0);
+    // virtual: the class has virtual members and is meant to be derived from,
+    // so deleting a derived object through a VCF* must run the full destructor chain
+    virtual ~VCF();
+
+    static bool static_match_genotype; //global variable
+    // for public access
+    void Compare(string ref_vcf,
+                 string query_vcf,
+                 string genome_seq,
+                 bool direct_search,
+                 string output_prefix,
+                 bool match_genotype,
+                 bool normalization);
+
+};
diff --git a/src/vm.cpp b/src/vm.cpp
new file mode 100644
index 0000000..6176e0a
--- /dev/null
+++ b/src/vm.cpp
@@ -0,0 +1,233 @@
+// vm.cpp : Defines the entry point for the console application.
+//
+//#include "stdafx.h"
+#include <iostream>
+#include <thread>
+#include <tclap/CmdLine.h>
+#include "wholegenome.h"
+
+using namespace std;
+
+// Command-line options collected by TclapParser.
+// Scalar members carry defaults so that a default-constructed Args never holds
+// indeterminate values (direct_match, for one, is not assigned by the parser).
+typedef struct Args {
+    string ref_vcf_filename;         // baseline VCF (-b)
+    string que_vcf_filename;
+    string genome_seq_filename;      // genome FASTA (-g)
+    string output_dir;               // -o
+    string output_prefix;
+    int thread_num = 0;              // -t
+    int score_unit = -1;             // -u
+    int match_mode = -1;             // -m
+    int score_scheme = -1;           // -s
+    bool detail_results = false;     // -e
+    vector<string> query_file_list;  // every -q occurrence
+    bool pr_curves = true;           // cleared by -C
+    bool direct_match = false;
+}Args;
+
+// Parse the command line into args using TCLAP.
+// Returns true on success; a TCLAP::ArgException is reported on stderr and
+// the process is aborted.
+bool TclapParser(Args & args, int argc, char** argv){
+    string version = "0.9";
+
+    try {
+        std::string desc = "Please cite our paper if you are using this program in your research. \n";
+        TCLAP::CmdLine cmd(desc, ' ', version);
+        TCLAP::ValueArg<std::string> arg_genome_seq_filename("g", "genome_sequence", "genome sequence FASTA file", true, "", "file");
+        TCLAP::ValueArg<std::string> arg_baseline_vcf_filename("b", "baseline", "baseline variant VCF file", true, "", "file");
+        TCLAP::MultiArg<std::string> arg_query_vcf_filename("q", "query", "query variant VCF file list", true, "file list");
+        TCLAP::ValueArg<std::string> arg_output_dir("o", "output_dir", "output directory, default is current working directory", false, ".", "string");
+        // Not registered with cmd below, so getValue() always yields "out".
+        TCLAP::ValueArg<std::string> arg_output_prefix("p", "file_prefix", "output filename prefix, default is \"out\"", false, "out", "string");
+
+        // hardware_concurrency() may return 0 when the count is unknown.
+        int max_cores = (int)thread::hardware_concurrency();
+        if(max_cores <= 0) max_cores = 1;
+
+        string thread_string = "number of threads, default is the number of available cores (For this machine: " + to_string(max_cores) + ").\n"
+                "If larger than number of available cores or less than 1, automatically set to default value";
+        TCLAP::ValueArg<int> arg_thread_num("t", "thread_num", thread_string, false, max_cores, "int");
+        vector<int> allowed_two = {-1, 0,1};
+        TCLAP::ValuesConstraint<int> allowedVals(allowed_two);
+
+        string score_unit_string = "scoring function/score unit: (Default: -1)\n"
+                "-1 : iterate both 0 and 1.\n"
+                "0 : the score that a VCF entry contributes is 1.\n"
+                "1 : the score that a VCF entry contributes is the edit distance between the new allele and the reference one.\n";
+
+        TCLAP::ValueArg<int> arg_score_unit("u", "score_unit", score_unit_string, false, -1, &allowedVals);
+
+        string match_mode_string = "matching mode: (Default: -1)\n"
+                "-1 : iterate both 0 and 1.\n"
+                "0 : a set of query entries match a set of baseline entries if, "
+                "for each entry, we can select one of the alleles such that the inferred sequences are identical\n"
+                "1 : a set of query entries match a set of baseline entries if there exist a phasing of each set such that "
+                "the two inferred haplotypes from the query are equal to the two inferred haplotypes from the baseline.\n";
+
+        TCLAP::ValueArg<int> arg_match_mode("m", "match_mode", match_mode_string, false, -1, &allowedVals);
+
+        // The real default is -1 (see the ValueArg below), so scheme 0 is no
+        // longer labelled "(Default)".
+        string score_scheme_string = "scoring scheme: (Default: -1)\n"
+                "-1 : iterate 0, 1, and 2 (not including 3)\n"
+                "0 : find two subsets of non-overlapping equivalent variants such that "
+                "the score of the matched variants is maximized\n"
+                "1 : find two subsets of non-overlapping equivalent variants such that"
+                " the score of the chosen baseline variants is maximized\n"
+                "2 : find a maximum scoring set of variants in the query such that"
+                " each variant can be matched by a subset of the baseline variants\n"
+                "3 : (1 to 1 direct match) find a maximum scoring set of entry pairs such that each entry pair contains"
+                " one query and one baseline variant that result in the same sequence. In this scheme, different scoring functions and "
+                "matching mode have no difference.\n";
+        vector<int> allowed_four = {-1,0,1,2,3};
+        TCLAP::ValuesConstraint<int> allowedFour(allowed_four);
+        TCLAP::ValueArg<int> arg_score_scheme("s", "score_scheme", score_scheme_string, false, -1, &allowedFour);
+
+        string detail_results_string = "output detail matching results, by default do not output.\n"
+                "filename in format PREFIX.PARAMETER.match\n"
+                "The results present which variants in baseline match which variants in query.";
+
+        TCLAP::SwitchArg arg_detail_results("e","detail_results", detail_results_string, cmd, false);
+
+        string precision_recall_string = "Disable Precision-Recall curves. \n";
+        TCLAP::SwitchArg arg_disable_curves("C", "disable_curves", precision_recall_string, cmd, false);
+
+        cmd.add(arg_score_scheme);
+        cmd.add(arg_match_mode);
+        cmd.add(arg_score_unit);
+        cmd.add(arg_thread_num);
+        cmd.add(arg_output_dir);
+        cmd.add(arg_query_vcf_filename);
+        cmd.add(arg_baseline_vcf_filename);
+        cmd.add(arg_genome_seq_filename);
+
+        cmd.parse(argc, argv);
+
+        args.genome_seq_filename = arg_genome_seq_filename.getValue();
+        args.ref_vcf_filename = arg_baseline_vcf_filename.getValue();
+        args.query_file_list = arg_query_vcf_filename.getValue();
+        args.output_dir = arg_output_dir.getValue();
+        args.output_prefix = arg_output_prefix.getValue();
+        args.thread_num = arg_thread_num.getValue();
+        // clamp to [1, max_cores], as the help text promises
+        if(args.thread_num <= 0 || args.thread_num > max_cores) args.thread_num = max_cores;
+        args.score_unit = arg_score_unit.getValue();
+        args.match_mode = arg_match_mode.getValue();
+        args.score_scheme = arg_score_scheme.getValue();
+        args.detail_results = arg_detail_results.getValue();
+        args.pr_curves = ! arg_disable_curves.getValue();
+    }
+    catch (TCLAP::ArgException &e)
+    {
+        std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
+        abort();
+    }
+    return true;
+}
+
+// Print a short usage summary to stdout; always returns 0.
+// The option list mirrors the arguments registered in TclapParser.
+int usage(char* command) {
+    cout << "\n";
+    cout << "\tPlease cite our paper if you are using this program in your research." << endl;
+    cout << endl;
+    cout << "Usage: " << endl;
+    cout << command << " -g genome file path(FASTA format)" << endl;
+    cout << "\t-b baseline VCF file path" << endl;
+    cout << "\t-q query VCF file path (may be given multiple times)" << endl;
+    cout << "\t[-o output directory]" << endl;
+    cout << "\t[-t thread number]" << endl;
+    cout << "\t[-u score unit: -1, 0 or 1]" << endl;
+    cout << "\t[-m match mode: -1, 0 or 1]" << endl;
+    cout << "\t[-s score scheme: -1, 0, 1, 2 or 3]" << endl;
+    cout << "\t[-e output detail matching results]" << endl;
+    cout << "\t[-C disable Precision-Recall curves]" << endl;
+    cout << endl;
+
+    return 0;
+}
+
+// Program entry point: parse the options, call ReadRef once with the genome
+// and baseline VCF, then run Compare for each query VCF in command-line order.
+// Returns 0 on completion, 1 if option parsing reports failure.
+int main(int argc, char* argv[])
+{
+    Args args;
+    if (!TclapParser(args, argc, argv)) {
+        return 1;
+    }
+
+    WholeGenome wg(args.thread_num,
+                   args.output_dir,
+                   args.pr_curves);
+
+    wg.ReadRef(args.genome_seq_filename,
+               args.ref_vcf_filename);
+
+    // Query files are labelled "query1", "query2", ... in the order given.
+    for (size_t i = 0; i < args.query_file_list.size(); i++) {
+        string query_filename = args.query_file_list[i];
+
+        wg.Compare(query_filename,
+                   "query" + to_string(i + 1),
+                   args.detail_results,
+                   args.score_unit,
+                   args.match_mode,
+                   args.score_scheme);
+    }
+
+    return 0;
+}
diff --git a/src/wholegenome.cpp b/src/wholegenome.cpp
new file mode 100644
index 0000000..bb05500
--- /dev/null
+++ b/src/wholegenome.cpp
@@ -0,0 +1,3341 @@
+#include "wholegenome.h"
+
+using namespace std;
+
+// constructor: stores the run options, allocates one baseline and one query
+// variant list per chromosome slot, and registers the chromosome names
+// 1..22, X and Y, each with and without the "chr" prefix.
+WholeGenome::WholeGenome(int thread_num_,
+        string output_dir_,
+        bool pr_curves){
+
+    thread_num = thread_num_;
+    output_dir = output_dir_;
+    chrom_num = 24;
+
+    ref_variant_by_chrid = new vector<DiploidVariant>*[chrom_num];
+    que_variant_by_chrid = new vector<DiploidVariant>*[chrom_num];
+    for (int slot = 0; slot < chrom_num; ++slot) {
+        ref_variant_by_chrid[slot] = new vector<DiploidVariant>;
+        que_variant_by_chrid[slot] = new vector<DiploidVariant>;
+    }
+
+    // chromosome ids are 0-based: "1"/"chr1" -> 0, ..., "22"/"chr22" -> 21
+    for (int number = 1; number <= 22; ++number) {
+        const string plain_name = to_string(number);
+        chrname_dict[plain_name] = number - 1;
+        chrname_dict["chr" + plain_name] = number - 1;
+    }
+    chrname_dict["X"] = 22;
+    chrname_dict["chrX"] = 22;
+    chrname_dict["Y"] = 23;
+    chrname_dict["chrY"] = 23;
+
+    if (!pr_curves) {
+        per_list = {0.0};
+    } else {
+        per_list = {0.0, 0.1, 0.2, 0.3, 0.9};
+    }
+
+}
+
+// Pack the three settings into a 4-bit index:
+// bit 3 = score_unit & 1, bit 2 = match_mode & 1, bits 1-0 = score_scheme & 3.
+inline int WholeGenome::GetIndexFromMatchScore(int score_unit, int match_mode, int score_scheme){
+    const int unit_bit = (score_unit & 1) << 3;
+    const int mode_bit = (match_mode & 1) << 2;
+    const int scheme_bits = score_scheme & 3;
+    return unit_bit | mode_bit | scheme_bits;
+}
+
+// destructor: releases the first chrom_num per-chromosome variant lists and
+// then the two pointer arrays that held them.
+WholeGenome::~WholeGenome(){
+
+    for (int slot = 0; slot < chrom_num; ++slot) {
+        // deleting a vector destroys its elements; no clear() needed first
+        delete ref_variant_by_chrid[slot];
+        delete que_variant_by_chrid[slot];
+    }
+    delete[] que_variant_by_chrid;
+    delete[] ref_variant_by_chrid;
+}
+
+// Read every FASTA record of `filename` into genome_sequences.
+// Chromosome ids are assigned in order of first appearance and recorded in
+// chrid_by_chrname / chrname_by_chrid; chrom_num is set to the number of
+// records stored. Returns false if the file cannot be opened or a record
+// name is missing from chrname_dict.
+bool WholeGenome::ReadWholeGenomeSequence(string filename){
+    std::ifstream input(filename);
+    if(!input.good()){
+        std::cerr << "Error opening '"<<filename<<"'. Bailing out." << std::endl;
+        return false;
+    }
+
+    std::string line, name, content;
+    int real_chrom_num = 0;
+    int chr_id = 0; // next unused chromosome id
+
+    // Store the record collected in name/content; false if the name is unknown.
+    auto commit_record = [&]() -> bool {
+        if(chrname_dict.find(name) == chrname_dict.end()){
+            cout << "[VarMatch] Error: detected chromosome name: " << name <<" does not exist in human genome." << endl;
+            return false;
+        }
+        if(chrid_by_chrname.find(name) == chrid_by_chrname.end()){
+            chrid_by_chrname[name] = chr_id;
+            chr_id++;
+        }
+        int current_id = chrid_by_chrname[name];
+        chrname_by_chrid[current_id] = name;
+        genome_sequences[current_id] = content;
+        real_chrom_num++;
+        return true;
+    };
+
+    // Test the stream itself rather than good(): reading a final line that has
+    // no trailing newline sets eofbit, and good() would discard that line.
+    while( std::getline( input, line ) ){
+        // tolerate CRLF line endings
+        if( !line.empty() && line.back() == '\r' ) line.pop_back();
+
+        if( line.empty() || line[0] == '>' ){ // Identifier marker
+            if( !name.empty() ){ // Store what we read from the last entry
+                if( !commit_record() ) return false;
+                name.clear();
+            }
+            if( !line.empty() ){
+                // record name = header text up to the first space, without '>'
+                name = split(line, ' ')[0].substr(1);
+            }
+            content.clear();
+        } else if( !name.empty() ){
+            if( line.find(' ') != std::string::npos ){ // Invalid sequence--no spaces allowed
+                name.clear();
+                content.clear();
+            } else {
+                content += line;
+            }
+        }
+    }
+    if( !name.empty() ){ // Store the last entry of the file
+        if( !commit_record() ) return false;
+    }
+
+    chrom_num = real_chrom_num;
+    return true;
+}
+
+// Not implemented: nothing is read and false is always returned.
+// (Flowing off the end of a non-void function is undefined behaviour, so the
+// stub must return a value explicitly.)
+bool WholeGenome::ReadGenomeSequenceList(string filename){
+    (void)filename; // unused until this is implemented
+    return false;
+}
+
+int WholeGenome::ReadWholeGenomeVariant(string filename, bool flag){
+ int total_num = 0;
+ int long_num = 0;
+ double QUAL_LOWER_BOUND = 0.1;
+
+ ifstream vcf_file;
+ vcf_file.open(filename.c_str());
+ if (!vcf_file.good()) {
+ cout << "[VarMatch] Error: can not open vcf file" << endl;
+ return -1;
+ }
+
+ vector<float> quality_list;
+
+ int genotype_index = -1;
+ char genotype_separator = '/';
+ //int genome_sequence_length = genome_sequence.length();
+ while (!vcf_file.eof()) { // alternative way is vcf_file != NULL
+ string line;
+ getline(vcf_file, line, '\n');
+ // check ineligible lines
+ //dout << line << endl;
+ if ((int)line.length() <= 1) continue;
+ //if (line.find_first_not_of(' ') == std::string::npos) continue;
+
+ if (line[0] == '#') {
+ continue;
+ }
+ auto columns = split(line, '\t');
+ if (columns.size() < 10) {
+ if(match_mode_indicator != 1){
+ cout << "[VarMatch] Warning: not enough information in VCF file for genotype matching." << endl;
+ cout << "[VarMatch] \tAutomatically turn off genotype matching module " << filename << endl;
+ match_mode_indicator = 1;
+ //continue;
+ }
+ if(columns.size() < 6){
+ cout << "[VarMatch] Warning: not enough information in VCF file for variant matching." << endl;
+ cout << "[VarMatch] skip current variant: " << line << endl;
+ continue;
+ }
+ }
+ string chr_name = columns[0];
+ auto pos = atoi(columns[1].c_str()) - 1; // 0-based coordinate
+
+ auto ref = columns[3];
+ auto alt_line = columns[4];
+ double quality = stod(columns[5]);
+
+ if(flag){
+ quality_list.push_back(quality);
+ }
+
+ ToUpper(ref);
+ ToUpper(alt_line);
+
+ bool is_heterozygous_variant = false;
+ bool is_multi_alternatives = false;
+ bool is_zero_one_var = false;
+
+ vector<string> genotype_columns;
+
+ if (match_mode_indicator != 1) { // match mode indicator is -1 or 0
+ if (genotype_index < 0) {
+ // change genotype index
+ auto formats = split(columns[8], ':');
+ for (int i = 0; i < formats.size(); i++) {
+ if (formats[i] == "GT") {
+ genotype_index = i;
+ break;
+ }
+ }
+ // if GT not found
+ if(genotype_index < 0){
+ if(match_mode_indicator != 1 && match_mode_indicator != 1){
+ cout << "[VarMatch] Warning: VCF entry does not contain genotype information." << endl;
+ cout << "[VarMatch] \tAutomatically turn off genotype matching mode. " << endl;
+ match_mode_indicator = 1;
+ }
+ }
+ }
+
+
+ if(match_mode_indicator != 1){
+
+ auto additionals = split(columns[9], ':');
+ genotype_columns = split(additionals[genotype_index], genotype_separator);
+
+ if(genotype_columns.size() != 2){
+
+ if(genotype_separator == '/'){
+ genotype_separator = '|';
+ }else{
+ genotype_separator = '/';
+ }
+ genotype_columns = split(additionals[genotype_index], genotype_separator);
+ }
+
+ // normalize format of genotype: sorted, separated by |
+ if (genotype_columns.size() != 2) {
+ cout << "[VarMatch] Warning: Unrecognized Genotype: " << additionals[genotype_index] << endl;
+ cout << "[VarMatch] \tAutomatically turn off genotype matching mode." << endl;
+ match_mode_indicator = 1;
+ }
+ else {
+ if (genotype_columns[0] != genotype_columns[1]) {
+ is_heterozygous_variant = true;
+ }
+ if (genotype_columns[1] == "0" && genotype_columns[0] == "0") {
+ //cout << "Skip Variants when both genotype is refernce allele: " << line << endl;
+ continue;
+ }
+ if(genotype_columns[0] == "0" || genotype_columns[1] == "0"){
+ is_zero_one_var = true;
+ }
+ }
+ }
+ }
+
+ vector<string> alt_list;
+ if (alt_line.find(",") != std::string::npos) {
+ alt_list = split(alt_line, ',');
+ is_multi_alternatives = true;
+ }
+ else {
+ alt_list.push_back(alt_line);
+ }
+
+ if(alt_list.size() > 2){
+ if(match_mode_indicator != 1){
+ vector<string> temp_alt_list = alt_list;
+ alt_list.clear();
+ for(int i = 0; i < 2; i++){
+ int alt_indicator = stoi(genotype_columns[i]);
+ if(alt_indicator == 0) continue;
+ alt_list.push_back(temp_alt_list[alt_indicator-1]);
+ }
+ }else{
+ vector<string> temp_alt_list = alt_list;
+ alt_list.clear();
+ alt_list.push_back(temp_alt_list[0]);
+ alt_list.push_back(temp_alt_list[1]);
+ }
+ }
+
+ int snp_ins = max(0, (int)alt_list[0].length() - (int)ref.length());
+ int snp_del = max(0, (int)ref.length() - (int)alt_list[0].length());
+ if(is_multi_alternatives){
+ snp_ins = max(snp_ins, (int)alt_list[1].length() - (int)ref.length());
+ snp_del = max(snp_del, (int)ref.length() - (int)alt_list[1].length());
+ }
+
+ if(snp_ins > VAR_LEN || snp_del > VAR_LEN){
+ //dout << "[VarMatch] skip large INDEL with length > " << VAR_LEN << "| "<< line <<endl;
+ long_num ++;
+ continue;
+ }
+
+ DiploidVariant dv(pos, ref, alt_list, is_heterozygous_variant, is_multi_alternatives, snp_del, snp_ins, flag, quality, is_zero_one_var);
+ //if (normalization) {
+ //NormalizeDiploidVariant(dv);
+ //}
+ if(chrid_by_chrname.find(chr_name) != chrid_by_chrname.end()){
+ int chr_id = chrid_by_chrname[chr_name];
+ if(flag == false){
+ ref_variant_by_chrid[chr_id]->push_back(dv);
+ //baseline_variant_strings.push_back(line);
+ }else{
+ que_variant_by_chrid[chr_id]->push_back(dv);
+ query_variant_strings.push_back(line);
+ }
+ }else{
+ cout << "[VarMatch] skip current variant as no corresponding reference genome sequence found." << endl;
+ continue;
+ int chr_id = chrname_dict[chr_name];
+ if(flag == false){
+ ref_variant_by_chrid[chr_id]->push_back(dv);
+ //baseline_variant_strings.push_back(line);
+ }else{
+ que_variant_by_chrid[chr_id]->push_back(dv);
+ query_variant_strings.push_back(line);
+ }
+ }
+
+ total_num++;
+ }
+ vcf_file.close();
+
+ if(flag){
+ sort(quality_list.begin(), quality_list.end());
+ auto qual_lower_it = lower_bound(quality_list.begin(), quality_list.end(), QUAL_LOWER_BOUND);
+ int qual_lower_index = qual_lower_it - quality_list.begin();
+ int rest_size = quality_list.size() - qual_lower_index;
+
+ vector<float> temp_percentage_list;
+ temp_percentage_list.push_back(0.0);
+ threshold_list.push_back(0.0);
+
+ for(int i = 1; i < per_list.size(); i++){
+ int additional_index = (int)(rest_size * per_list[i]);
+ int real_index = qual_lower_index + additional_index;
+ if(real_index >= quality_list.size()) real_index = quality_list.size() - 1;
+ double quality = quality_list[real_index];
+ threshold_list.push_back(quality);
+
+ auto quality_lowit = lower_bound(quality_list.begin(), quality_list.end(), quality);
+ int quality_low_index = quality_lowit - quality_list.begin();
+ // following program will retain variants >= quality threshold
+
+ int quality_size = quality_low_index + 1; // counting number, +/- 1 does not matter
+ if(quality_size > quality_list.size()) quality_size = quality_list.size();
+ double percentage = (double)quality_size/ quality_list.size();
+ temp_percentage_list.push_back(percentage);
+ }
+ threshold_num = threshold_list.size();
+ // revice percentage
+ per_list = temp_percentage_list;
+ }
+ cout << flag << "," << total_num << "," << long_num << endl;
+ return total_num;
+}
+
+// Placeholder for loading variants from a list of files; not implemented.
+//
+// filename: path of the list file (currently ignored).
+// Returns false, because nothing is read. The previous empty body flowed off
+// the end of a non-void function, which is undefined behaviour in C++.
+bool WholeGenome::ReadVariantFileList(string filename){
+    (void)filename; // unused until the function is implemented
+    return false;
+}
+
+// Edit distance between the reference allele of dv and the alternate allele
+// at index allele_indicator of dv.alts.
+int WholeGenome::ScoreEditDistance(DiploidVariant & dv, int allele_indicator){
+    const auto & reference_allele = dv.ref;
+    const auto & alternate_allele = dv.alts[allele_indicator];
+    return EditDistance(reference_allele, alternate_allele);
+}
+
+// Levenshtein distance between s1 and s2 (unit cost for insertion, deletion
+// and substitution), computed with two rolling rows of the DP table.
+inline int WholeGenome::EditDistance(const std::string& s1, const std::string& s2)
+{
+    const std::size_t rows = s1.size();
+    const std::size_t cols = s2.size();
+    std::vector<unsigned int> current(cols + 1), previous(cols + 1);
+
+    // distance from the empty prefix of s1 to every prefix of s2
+    for (std::size_t c = 0; c <= cols; ++c) {
+        previous[c] = static_cast<unsigned int>(c);
+    }
+
+    for (std::size_t r = 0; r < rows; ++r) {
+        current[0] = static_cast<unsigned int>(r + 1);
+        for (std::size_t c = 0; c < cols; ++c) {
+            const unsigned int deletion = previous[c + 1] + 1;
+            const unsigned int insertion = current[c] + 1;
+            const unsigned int substitution = previous[c] + (s1[r] == s2[c] ? 0u : 1u);
+            current[c + 1] = std::min(std::min(deletion, insertion), substitution);
+        }
+        // the row just filled becomes the "previous" row of the next iteration
+        current.swap(previous);
+    }
+    return previous[cols];
+}
+
+// Needleman-Wunsch initialisation.
+// Fills the first row (columns 0..M) and the first column (rows 0..N) of the
+// score and trace-back matrices: a border cell at distance d from the origin
+// scores -d, the first row traces back with '-', the first column with '|',
+// and the origin is marked '*'.
+inline void WholeGenome::initialize_score_matrix(int **score, char **trackBack, int M, int N)
+{
+    for (int col = 0; col <= M; ++col) {
+        score[0][col] = -col;
+        trackBack[0][col] = '-';
+    }
+
+    for (int row = 0; row <= N; ++row) {
+        score[row][0] = -row;
+        trackBack[row][0] = '|';
+    }
+
+    trackBack[0][0] = '*';
+}
+
+// Global alignment of S1 against S2 (Needleman-Wunsch) with match score 0
+// and a penalty of 1 for every mismatch or gap.
+//
+// S1, S2: sequences to align.
+// R1, R2: overwritten with the aligned sequences; gaps are written as '-'.
+// Returns the alignment score stored in the bottom-right cell of the matrix.
+//
+// The matrices are allocated with new[] and therefore released with
+// delete[]; the previous scalar delete on these arrays was undefined
+// behaviour.
+int WholeGenome::needleman_wunsch(string S1, string S2, string &R1, string &R2)
+{
+    int M = S1.length();
+    int N = S2.length();
+    /*
+     0MMMMMMMMMMMMMMMM
+     N
+     N
+     N
+     N
+     N
+     N
+     so the matrix is N*M (rows follow S2, columns follow S1)
+    */
+    int **score = new int *[N+1];
+    for (int i = 0; i <= N; i++)
+    {
+        score[i] = new int [M+1];
+    }
+
+    // '*' for a diagonal (match/mismatch) move, '-' for a move to the right,
+    // '|' for a move downward
+    char **trackBack = new char *[N+1];
+    for (int i = 0; i <= N; i++)
+    {
+        trackBack[i] = new char [M+1];
+    }
+    R1 = "";
+    R2 = "";
+    initialize_score_matrix(score, trackBack, M, N);
+
+    for (int i = 1; i <= N; i++)
+    {
+        for (int k = 1; k <= M; k++)
+        {
+            char S1_k = S1[k-1];
+            char S2_i = S2[i-1];
+            int matchingCost = score[i-1][k-1];
+            if(S1_k != S2_i) matchingCost--;
+            int rightCost = score[i][k-1] - 1;
+            int downCost = score[i-1][k] - 1;
+            // the diagonal move is taken only when strictly best; ties go to
+            // the right move first, then to the downward move
+            if (matchingCost > rightCost && matchingCost > downCost)
+            {
+                score[i][k] = matchingCost;
+                trackBack[i][k] = '*';
+            }else if(rightCost >= downCost)
+            {
+                score[i][k] = rightCost;
+                trackBack[i][k] = '-';
+            }else
+            {
+                score[i][k] = downCost;
+                trackBack[i][k] = '|';
+            }
+        }
+    }
+
+    // trace back from the bottom-right cell; the border cells written by
+    // initialize_score_matrix guarantee the walk ends at (0, 0)
+    int n = N;
+    int m = M;
+    while(n > 0 || m > 0)
+    {
+        if (trackBack[n][m] == '*')
+        {
+            R1 += S1[m-1];
+            R2 += S2[n-1];
+            n--;
+            m--;
+        }else if(trackBack[n][m] == '-')
+        {
+            R1 += S1[m-1];
+            R2 += '-';
+            m--;
+        }else if(trackBack[n][m] == '|')
+        {
+            R1 += '-';
+            R2 += S2[n-1];
+            n--;
+        }
+    }
+    // the alignment was collected back to front
+    reverse(R1.begin(), R1.end());
+    reverse(R2.begin(), R2.end());
+
+    int result = score[N][M];
+
+    for (int i = 0; i <= N; i++)
+    {
+        delete[] score[i];
+        delete[] trackBack[i];
+    }
+
+    delete[] score;
+    delete[] trackBack;
+
+    return result;
+}
+
+// Distributes the bases of alt over the positions of ref.
+//
+// ref and alt are aligned with needleman_wunsch; afterwards one string per
+// reference base is appended to alt_vector, holding the alt bases aligned to
+// that base (empty when the base is deleted). Alt bases aligned to a gap in
+// ref are attached to the most recent reference position, or to position 0
+// when the gap precedes the first reference base.
+//
+// Nothing is appended when ref is empty.
+//
+// The index used for a non-gap column is clamped to the last reference
+// position. The original guard for an index past the end lacked an else, so
+// it would have pushed the base twice and then written out of bounds.
+void WholeGenome::GenerateAltVector(string ref, string alt, vector<string> & alt_vector){
+    if(ref.size() == 0) return;
+    string ref_match = "";
+    string alt_match = "";
+    needleman_wunsch(ref, alt, ref_match, alt_match);
+
+    const int ref_length = (int)ref.size();
+    for(int i = 0; i < ref_length; i++){
+        alt_vector.push_back("");
+    }
+
+    int current_ref_index = -1;
+    for(int i = 0; i < (int)ref_match.size(); i++){
+        if(ref_match[i] == '-'){
+            // insertion relative to ref: keep it with the preceding ref base
+            int target = current_ref_index < 0 ? 0 : current_ref_index;
+            alt_vector[target].push_back(alt_match[i]);
+        }else if(alt_match[i] == '-'){
+            // deletion relative to ref: the slot for this ref base stays empty
+            current_ref_index ++;
+        }else{
+            current_ref_index ++;
+            int target = min(current_ref_index, ref_length - 1);
+            alt_vector[target].push_back(alt_match[i]);
+        }
+    }
+    return;
+}
+
+// Clusters variants chromosome by chromosome, using up to thread_num
+// chromosomes at a time, and appends every resulting cluster to
+// variants_by_cluster. Always returns true.
+bool WholeGenome::ParallelClustering(){
+ // parallel by chr
+ // one heap-allocated cluster list per chromosome; freed at the end of this function
+ variant_cluster_by_chrid = new vector<vector<VariantIndicator>> *[chrom_num];
+ for (int j = 0; j < chrom_num; j++) {
+ variant_cluster_by_chrid[j] = new vector<vector<VariantIndicator>>;
+ }
+
+ // number of batches = ceil(chrom_num / thread_num)
+ int parallel_steps = chrom_num / thread_num;
+ if(parallel_steps*thread_num < chrom_num) parallel_steps += 1;
+ int chr_id = 0;
+ for(int i = 0; i < parallel_steps; i++){
+ vector<thread> threads;
+ // up to thread_num-1 chromosomes are handed to worker threads; chromosomes
+ // without a name entry or with an empty baseline/query list are skipped
+ for(int j = 0; j < thread_num-1 && chr_id < chrom_num-1; j++){
+ if(chrname_by_chrid.find(chr_id) != chrname_by_chrid.end()){
+ if(ref_variant_by_chrid[chr_id]->size() > 0 && que_variant_by_chrid[chr_id]->size() > 0){
+ threads.push_back(thread(&WholeGenome::SingleThreadClustering, this, chr_id));
+ }
+ }
+ chr_id ++;
+ }
+ // the calling thread processes one more chromosome itself
+ // NOTE(review): unlike the worker branch, this call is not guarded by the
+ // non-empty check on the variant lists - confirm SingleThreadClustering copes
+ if(chr_id < chrom_num){
+ if(chrname_by_chrid.find(chr_id) != chrname_by_chrid.end()){
+ SingleThreadClustering(chr_id);
+ }
+ chr_id ++;
+ }
+ // wait for every worker of this batch before starting the next one
+ std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
+ threads.clear();
+ }
+
+
+ // concatenate the per-chromosome clusters in chromosome-id order
+ for(int i = 0; i < chrom_num; i++){
+ if(variant_cluster_by_chrid[i]->size() > 0){
+ variants_by_cluster.insert(variants_by_cluster.end(), variant_cluster_by_chrid[i]->begin(), variant_cluster_by_chrid[i]->end());
+ }
+ }
+
+ // test output
+ //dout << endl;
+ // size_num: cluster size -> number of clusters of that size
+ // size_chrid: cluster size -> first chromosome id where that size occurs
+ // both maps are only read by the commented-out debug output below
+ map<int, int> size_num;
+ map<int, int> size_chrid;
+ for(int i = 0; i < chrom_num; i++){
+ //dout << i << ": " << variant_cluster_by_chrid[i]->size() << endl;
+ for(int j = 0; j < variant_cluster_by_chrid[i]->size(); j++){
+ int temp_size = variant_cluster_by_chrid[i]->at(j).size();
+ if(size_num.find(temp_size) != size_num.end()){
+ size_num[temp_size] ++;
+ }else{
+ size_num[temp_size] = 1;
+ }
+ if(size_chrid.find(temp_size) == size_chrid.end()){
+ size_chrid[temp_size] = i;
+ }
+ }
+ }
+
+ //cout << endl;
+ //for(auto it = size_num.begin(); it != size_num.end(); ++it){
+ // dout << it->first << ": " << it->second << endl;
+ //}
+
+// cout << endl;
+// cout << "size and location:" << endl;
+// for(auto it = size_chrid.begin(); it != size_chrid.end(); ++it){
+// dout << it->first << ": " << it->second << endl;
+// }
+ // clean at the end of function
+
+ // release the per-chromosome cluster lists allocated at the top
+ for(int j = 0; j < chrom_num; j++){
+ variant_cluster_by_chrid[j]->clear();
+ delete variant_cluster_by_chrid[j];
+ }
+ delete[] variant_cluster_by_chrid;
+
+ return true;
+}
+
+// Placeholder; not implemented.
+// Returns false, because no matching is performed. The previous empty body
+// flowed off the end of a non-void function, which is undefined behaviour.
+bool WholeGenome::ParallelMatching(){
+    return false;
+}
+
+// Placeholder; not implemented.
+// Returns false, because no matching is performed. The previous empty body
+// flowed off the end of a non-void function, which is undefined behaviour.
+bool WholeGenome::TBBMatching()
+{
+    return false;
+}
+
+
+// Reports whether sequence (compared case-insensitively) starts with a unit
+// that is immediately repeated through the rest of the sequence.
+//
+// A unit length L qualifies when every complete L-sized window after the
+// first equals the first window and there is at least one such window; an
+// incomplete trailing window is ignored. A sequence of length 1 is always
+// reported as a repeat. unit_threshold does not influence the result.
+bool WholeGenome::CheckTandemRepeat(string sequence, int unit_threshold) {
+    const int total_length = (int)sequence.length();
+    if(total_length == 1) return true;
+    transform(sequence.begin(), sequence.end(), sequence.begin(), ::toupper);
+
+    const int longest_unit = total_length / 2 + 1;
+    for (int unit = 1; unit <= longest_unit; unit++) {
+        int copies = 1;
+        bool all_windows_equal = true;
+        // walk over the complete windows that follow the first unit
+        for (int window = unit; window + unit <= total_length; window += unit) {
+            if (sequence.compare(window, unit, sequence, 0, unit) != 0) {
+                all_windows_equal = false;
+                break;
+            }
+            copies ++;
+        }
+        if (all_windows_equal && copies > 1) return true;
+    }
+    return false;
+}
+
+// preprocess
+// Prepares one cluster of variants for matching and runs the matcher on it.
+//
+// thread_index:    index into the per-thread result containers
+// threshold_index: index into the per-threshold counters
+// chr_id:          chromosome id, used to index genome_sequences / chrname_by_chrid
+// variant_list:    variants of the cluster (baseline and query mixed); sorted in place
+// cluster_id:      forwarded to MatchingSingleClusterBaseExtending
+//
+// Returns false when the cluster cannot match (one side empty, or rejected by
+// the size heuristic below); returns true after a direct match was recorded or
+// after the matcher was run for every score_unit / match_mode / score_scheme
+// combination (the matcher's own result is not propagated).
+bool WholeGenome::MatchVariantListInThread(int thread_index,
+ int threshold_index,
+ int chr_id,
+ vector<DiploidVariant> & variant_list,
+ int cluster_id){
+ //===================================================
+ sort(variant_list.begin(), variant_list.end());
+ // decide reference sequence
+ vector<DiploidVariant> separate_var_list[2];
+ vector<Interval> intervals;
+ // separate into ref and que
+ // total_mil / total_mdl accumulate the mil / mdl fields of all variants
+ // (presumably maximum insertion / deletion length - TODO confirm against DiploidVariant)
+ int total_mil = 0;
+ int total_mdl = 0;
+ int min_pos = genome_sequences[chr_id].length() + 1;
+ int max_pos = -1;
+ for (int i = 0; i < variant_list.size(); i++) {
+ int flag = 0;
+ if (variant_list[i].flag) flag = 1; // flag indicate if the variant is from ref set(0) or query set(1)
+ int pos = variant_list[i].pos;
+ separate_var_list[flag].push_back(variant_list[i]);
+ total_mil += variant_list[i].mil;
+ total_mdl += variant_list[i].mdl;
+ auto ref_sequence = variant_list[i].ref;
+ auto alt_sequences = variant_list[i].alts;
+ min_pos = min(pos, min_pos);
+ max_pos = max((int)(pos + ref_sequence.length()), max_pos);
+
+ int end_pos = pos + ref_sequence.length() - 1; // included end position!!
+ intervals.push_back(Interval(pos, end_pos));
+ }
+ // widen the window by one base on each side, clamped to the chromosome
+ min_pos = max(min_pos - 1, 0);
+ max_pos = min(max_pos + 1, (int)genome_sequences[chr_id].length()); //exclusive
+
+ // a cluster needs at least one variant from each side to match
+ if (separate_var_list[0].size() == 0 || separate_var_list[1].size() == 0) {
+ //dout << separate_var_list[0].size() << ", " << separate_var_list[1].size() << endl;
+ return false;
+ }
+ if (separate_var_list[0].size() == 1 && separate_var_list[1].size() == 1){
+ // try direct match to save time
+ if(separate_var_list[0][0] == separate_var_list[1][0]){
+
+ DiploidVariant tv = separate_var_list[0][0];
+ // record layout: chromosome, pos+1, ref, alt[/alt2], followed by five "." columns
+ string match_record = chrname_by_chrid[chr_id] + "\t" + to_string(tv.pos+1) + "\t" + tv.ref + "\t" + tv.alts[0];
+ if(tv.multi_alts) match_record += "/" + tv.alts[1];
+ match_record += "\t.\t.\t.\t.\t.\n";
+ // here we need to push back for all mode_index
+ //complex_match_records[thread_index]->push_back(match_record);
+
+ int edit_distance = CalculateEditDistance(tv, 0, 0);
+ // the same record and the same counters are updated for every mode index
+ for(int mi = 0; mi < mode_index_list.size(); mi ++){
+ int mode_i = mode_index_list[mi];
+ //if(mi == 0){
+
+
+ // this line should be recovered
+ match_records_by_mode_by_thread[thread_index][mode_i]->push_back(match_record);
+
+ //}else{
+ // match_records_by_mode_by_thread[thread_index][mode_i]->push_back("$"+to_string(match_records_by_mode_by_thread[thread_index][0]->size()));
+ // use dollor to represent that it is the same
+ //}
+ baseline_total_match_num[thread_index][threshold_index]->at(mode_i)++;
+ query_total_match_num[thread_index][threshold_index]->at(mode_i)++;
+
+ baseline_total_edit_distance[thread_index][threshold_index]->at(mode_i) += edit_distance;
+ query_total_edit_distance[thread_index][threshold_index]->at(mode_i) += edit_distance;
+ //calculate the edit distance
+ }
+ // output match result
+ return true;
+ }
+ // if not match, still can match by changing genome
+ }else if(separate_var_list[0].size() == 1 || separate_var_list[1].size() == 1){
+ // exactly one side has a single variant: flag is that side, r_flag the other
+ int flag = 0;
+ if(separate_var_list[1].size() == 1) flag = 1;
+ int r_flag = 1-flag;
+ if(separate_var_list[r_flag].size() > 4){
+ int total_r_mdl = 0;
+ int total_r_mil = 0;
+
+ for(int k = 0; k < separate_var_list[r_flag].size(); k++){
+ DiploidVariant var = separate_var_list[r_flag][k];
+ int var_mdl = var.mdl;
+ int var_mil = var.mil;
+ int ref_length = var.ref.length();
+ total_r_mdl += var_mdl;
+ total_r_mil += var_mil;
+ }
+
+ // give up when the single variant changes more than the other side's
+ // variants do in total
+ if(max(separate_var_list[flag][0].mdl, separate_var_list[flag][0].mil) > max(total_r_mdl, total_r_mil)) return false;
+ }
+ }
+
+ separate_var_list[0].clear();
+ separate_var_list[1].clear();
+ // remove singular variant
+ // [todo] try removing this filter to see running time changes
+ // appliable_flag[k] == false excludes variant k from the decision points below
+ vector<bool> appliable_flag;
+ int total_change = total_mil+total_mdl;
+
+ if(variant_list.size() > EASY_MATCH_VAR_NUM){
+ for(int k = 0; k < variant_list.size(); k++){
+ DiploidVariant cur_var = variant_list[k];
+ int max_change = max(cur_var.mil, cur_var.mdl);
+ // a variant whose own change exceeds the change of all the others together is dropped
+ if(max_change > total_change-max_change){
+ appliable_flag.push_back(false);
+ //dout << "this variant is removed" << endl;
+ }else{
+ appliable_flag.push_back(true);
+ }
+ }
+ }else{
+ for(int k = 0; k < variant_list.size(); k++){
+ appliable_flag.push_back(true);
+ }
+ }
+
+ // reference window covering the whole cluster; positions below are relative to offset
+ string subsequence = genome_sequences[chr_id].substr(min_pos, max_pos - min_pos);
+
+ ToUpper(subsequence); // subsequence only contains upper char
+ int offset = min_pos;
+ int subsequence_length = max_pos - min_pos;
+
+ // have subsequence in hand
+ //generate decision point
+ // choices_by_pos[0] holds baseline variants, choices_by_pos[1] query variants;
+ // key = window-relative position, value = index into variant_list
+ multimap<int, int> * choices_by_pos[2];
+ // choice by pos is to also equal to var by pos
+ for(int i = 0; i < 2; i++){
+ choices_by_pos[i] = new multimap<int, int>();
+ }
+
+ for(int index = 0; index < variant_list.size(); index++){
+ if(!appliable_flag[index]) continue;
+ // remove decision point if not applicable
+ int pos = variant_list[index].pos - offset;
+ int flag = 0;
+ if(variant_list[index].flag) flag = 1;
+ choices_by_pos[flag]->insert(pair<int, int>(pos, index));
+ //dout << pos << index << endl;
+ }
+
+ // sync points are the (window-relative) end positions of the merged variant intervals
+ vector<Interval> mergered_intervals = merge(intervals);
+// unordered_map<int, bool> sync_points;
+// for(int i = 0; i < mergered_intervals.size(); i++){
+// sync_points[mergered_intervals[i].end-offset] = true;
+// }
+ vector<int> sync_points;
+ for(int i = 0; i < mergered_intervals.size(); i++){
+ sync_points.push_back(mergered_intervals[i].end-offset);
+ }
+
+ // make sure the last sync point is the last base of the window
+ if(sync_points.back() < subsequence.size() - 1){
+ sync_points.push_back(subsequence.size()-1);
+ }
+
+ int score_unit;
+ int match_mode;
+ int score_scheme;
+
+ // run the matcher once per (score_unit, match_mode, score_scheme) combination
+ for(int i = 0; i < score_unit_list.size(); i++){
+ score_unit = score_unit_list[i];
+ for(int j = 0; j < match_mode_list.size(); j++){
+ match_mode = match_mode_list[j];
+ for(int k = 0; k < score_scheme_list.size(); k++){
+ score_scheme = score_scheme_list[k];
+
+ // the returned flag is not used
+ bool method2 = MatchingSingleClusterBaseExtending(
+ cluster_id,
+ thread_index,
+ variant_list,
+ subsequence,
+ offset,
+ choices_by_pos,
+ sync_points,
+ chr_id,
+ score_unit,
+ match_mode,
+ score_scheme,
+ threshold_index);
+ }
+ }
+ }
+
+ // release the two multimaps allocated above
+ for(int i = 0; i < 2; i++){
+ delete choices_by_pos[i];
+ }
+ //delete choices_by_pos;
+
+ return true;
+}
+
+// Resolves the variant indicators of clusters [start, end) into variants and
+// matches each cluster once per quality threshold.
+//
+// For every threshold, variants whose qual is below the threshold are left
+// out and the remaining ones are passed to MatchVariantListInThread.
+// Clusters with at most one indicator are skipped. Always returns true.
+bool WholeGenome::ClusteringMatchInThread(int start, int end, int thread_index) {
+
+    for (int cluster_id = start; cluster_id < end; cluster_id++) {
+        if(cluster_id >= variants_by_cluster.size()) break;
+
+        vector<VariantIndicator> indicators = variants_by_cluster[cluster_id];
+        if(indicators.size() <= 1) continue;
+
+        for(int threshold_index = 0; threshold_index < threshold_num; threshold_index++){
+            const double minimum_quality = threshold_list[threshold_index];
+
+            // chr_id ends up as the chromosome of the last indicator visited,
+            // whether or not its variant passed the quality filter
+            int chr_id = -1;
+            vector<DiploidVariant> selected_variants;
+            for(const VariantIndicator & indicator : indicators){
+                chr_id = indicator.chr_id;
+                const DiploidVariant & candidate = indicator.refer
+                    ? ref_variant_by_chrid[chr_id]->at(indicator.var_id)
+                    : que_variant_by_chrid[chr_id]->at(indicator.var_id);
+                if(!(candidate.qual < minimum_quality)){
+                    selected_variants.push_back(candidate);
+                }
+            }
+
+            const bool chromosome_valid = (chr_id != -1 && chr_id < chrom_num);
+            if(!chromosome_valid){
+                cout << "[VarMatch] Error in matching single cluster" << endl;
+                continue;
+            }
+
+            MatchVariantListInThread(thread_index,
+                                     threshold_index,
+                                     chr_id,
+                                     selected_variants,
+                                     cluster_id);
+        }
+    }
+    return true;
+}
+
+
+// SequencePath helpers live in WholeGenome and take the path as a parameter
+// to keep the memory footprint of each path small.
+//
+// Looks for a variant registered at pos (baseline side first, then query
+// side) whose entry in sp.choice_vector is still <= MEANING_CHOICE_BOUND.
+// Returns the index of the first such variant, or -1 when there is none.
+int WholeGenome::PathNeedDecision(SequencePath& sp, multimap<int, int> * choices_by_pos[], int pos){
+    for(int side = 0; side < 2; side++){
+        // equal_range yields an empty range when nothing is registered at pos
+        auto candidates = choices_by_pos[side]->equal_range(pos);
+        for(auto entry = candidates.first; entry != candidates.second; ++entry){
+            const int var_index = entry->second;
+            if(sp.choice_vector[var_index] <= MEANING_CHOICE_BOUND){
+                return var_index;
+            }
+        }
+    }
+    return -1;
+}
+
+// Checks that the donor sequences built so far for the two variant sets are
+// still compatible. Donor sequence s of the first set is compared with donor
+// sequence 2+s of the second set.
+//
+// match_mode == 0 compares both strands (s = 0 and 1); any other mode
+// compares strand 0 only.
+//
+// When every compared pair has equal length, the pairs must be identical.
+// Otherwise only the not-yet-verified part of the common prefix of each pair
+// is compared. sp.same_donor_len and sp.current_equal_donor_pos are updated
+// along the way.
+//
+// Returns 0 when the path is still consistent, -1 when it must be dropped.
+int WholeGenome::CheckPathEqualProperty(SequencePath & sp, int match_mode)
+{
+    const int strand_count = (match_mode == 0) ? 2 : 1;
+
+    bool lengths_agree = true;
+    for(int s = 0; s < strand_count; s++){
+        if(sp.donor_sequences[s].length() != sp.donor_sequences[2+s].length()){
+            lengths_agree = false;
+            break;
+        }
+    }
+
+    if(lengths_agree){
+        // same length: the sequences themselves have to be equal
+        for(int s = 0; s < strand_count; s++){
+            if(!(sp.donor_sequences[s] == sp.donor_sequences[2+s])) return -1;
+        }
+        sp.same_donor_len = true;
+        for(int s = 0; s < strand_count; s++){
+            sp.current_equal_donor_pos[s] = sp.donor_sequences[s].length()-1;
+        }
+        return 0;
+    }
+
+    // different length: verify the new part of the shared prefix, strand by strand
+    sp.same_donor_len = false;
+    for(int s = 0; s < strand_count; s++){
+        const int shared_length = min(sp.donor_sequences[s].length(), sp.donor_sequences[2+s].length());
+        for(int k = sp.current_equal_donor_pos[s]+1; k < shared_length; k++){
+            if(sp.donor_sequences[s][k] != sp.donor_sequences[2+s][k]){
+                return -1;
+            }
+        }
+        sp.current_equal_donor_pos[s] = shared_length-1;
+    }
+    return 0;
+}
+
+// Extends a path up to its next sync point (one step = one sync point, not
+// one nucleotide).
+//
+// Return codes:
+//  -1  the path is dead (already past the last sync point, or the donor
+//      sequences disagree)
+//   0  the sync point was reached and the path is still consistent
+//   1  a variant at the next position needs a decision first; its index is
+//      written to variant_need_decision and the path is not advanced past it
+//   2  the last sync point was reached and the donor sequences are equal
+int WholeGenome::PathExtendOneStep(SequencePath& sp,
+    multimap<int, int> * choices_by_pos[],
+    const string & reference_sequence,
+    vector<int> & sync_points,
+    int match_mode,
+    int & variant_need_decision){
+
+    if(sp.reached_sync_num >= sync_points.size()) return -1;
+
+    // the next sync point; this position is included in the extension
+    const int target_pos = sync_points[sp.reached_sync_num];
+    // in match_mode 1 only strands 0 and 2 are extended
+    const int strand_step = (match_mode == 1) ? 2 : 1;
+
+    int genome_pos = sp.current_genome_pos + 1;
+    while(genome_pos <= target_pos){
+        // a pending decision stops the extension, provided the path is still consistent
+        const int pending_variant = PathNeedDecision(sp, choices_by_pos, genome_pos);
+        if(pending_variant >= 0){
+            if(CheckPathEqualProperty(sp, match_mode) == -1) return -1;
+            variant_need_decision = pending_variant;
+            return 1;
+        }
+
+        // no decision needed: append one position to every active strand
+        for(int strand = 0; strand < 4; strand += strand_step){
+            if(sp.string_sequences[strand][genome_pos] == "."){
+                sp.donor_sequences[strand] += reference_sequence[genome_pos];
+            }else{
+                sp.donor_sequences[strand] += sp.string_sequences[strand][genome_pos];
+            }
+        }
+        sp.current_genome_pos = genome_pos;
+        genome_pos++;
+    }
+
+    sp.reached_sync_num ++;
+
+    if(sp.reached_sync_num < sync_points.size()){
+        return CheckPathEqualProperty(sp, match_mode);
+    }
+
+    // the last sync point is the end of the reference window: both strand
+    // pairs have to be identical for the path to survive
+    const bool donors_identical = sp.donor_sequences[0] == sp.donor_sequences[2] &&
+                                  sp.donor_sequences[1] == sp.donor_sequences[3];
+    return donors_identical ? 2 : -1;
+}
+
+// Score contributed by applying variant dv with the given choice.
+//
+// choice:       values <= NOT_USE contribute 0
+// score_unit:   0 counts the variant as 1; 1 uses edit distance; any other
+//               value leaves the base score at 0
+// match_mode:   with score_unit 1, mode 0 sums the edit distance of the
+//               alleles selected by choice, other modes use allele `choice`
+// score_scheme: 0 returns the score for every variant; 1 only for variants
+//               with dv.flag == false; 2 only for variants with dv.flag set
+//
+// Any other score_scheme returns the unfiltered score. The original had no
+// return statement on that path, which is undefined behaviour.
+// NOTE(review): confirm that "unfiltered" is the intended fallback.
+int WholeGenome::CalculateScore(DiploidVariant & dv,
+    int choice,
+    int score_unit,
+    int match_mode,
+    int score_scheme){
+    int score = 0;
+    if(choice <= NOT_USE) return score;
+    if(score_unit == 0){
+        score = 1;
+    }else if(score_unit == 1){
+        if(match_mode == 0){
+            if(choice == -1){
+                score += ScoreEditDistance(dv, 0);
+            }else if(choice == -2){
+                score += ScoreEditDistance(dv, 1);
+            }else if(choice == 0){
+                score += ScoreEditDistance(dv, 0);
+                if(dv.multi_alts && !dv.zero_one_var){
+                    score += ScoreEditDistance(dv, 1);
+                }
+            }else{
+                score += ScoreEditDistance(dv, 0);
+                score += ScoreEditDistance(dv, 1);
+            }
+        }else{
+            score += ScoreEditDistance(dv, choice);
+        }
+    }
+
+    if(score_scheme == 1){
+        // only variants with flag == false keep their score
+        return dv.flag ? 0 : score;
+    }
+    if(score_scheme == 2){
+        // only variants with flag set keep their score
+        return dv.flag ? score : 0;
+    }
+    // scheme 0, and the fallback for unrecognised schemes
+    return score;
+}
+
+
+// Edit distance contributed by variant dv under the given choice.
+//
+// Choices <= NOT_USE contribute 0. In match_mode 0 the distance is summed
+// over the alleles selected by choice; in any other mode it is twice the
+// distance of allele `choice`.
+int WholeGenome::CalculateEditDistance(DiploidVariant & dv,
+    int choice,
+    int match_mode){
+    if(choice <= NOT_USE) return 0;
+
+    if(match_mode != 0){
+        return 2 * ScoreEditDistance(dv, choice);
+    }
+
+    int distance = 0;
+    switch(choice){
+    case -1:
+        distance += ScoreEditDistance(dv, 0);
+        break;
+    case -2:
+        distance += ScoreEditDistance(dv, 1);
+        break;
+    case 0:
+        distance += ScoreEditDistance(dv, 0);
+        if(dv.multi_alts && !dv.zero_one_var){
+            distance += ScoreEditDistance(dv, 1);
+        }
+        break;
+    default:
+        distance += ScoreEditDistance(dv, 0);
+        distance += ScoreEditDistance(dv, 1);
+        break;
+    }
+    return distance;
+}
+
+
+// function no longer used, move to VariantMakeDecisionNoGenotype
+// no genotype means you can maintain only one strand
+// for simplicity, also work on original SequencePath data structure
+// when making decision, only decide one path
+// when extending, only extend one path
+// when comparing, only compare one path
+//
+// Branches path sp at position sp.current_genome_pos+1: for every combination
+// of one candidate choice from the baseline side and one from the query side
+// (including "use nothing"), a copy of sp with the chosen alleles applied is
+// appended to sequence_path_list. Only strand 0 of each side is edited.
+// choices_by_pos[0] / [1] map a position to variant indices of the two sides.
+// Always returns true.
+bool WholeGenome::PathMakeDecisionNoGenotype(SequencePath& sp,
+ vector<DiploidVariant> & variant_list,
+ multimap<int, int> * choices_by_pos[],
+ list<SequencePath> & sequence_path_list,
+ const string & reference_sequence,
+ int score_unit,
+ int match_mode,
+ int score_scheme)
+{
+ int pos = sp.current_genome_pos+1;
+ // candidate_choices[side] holds (variant index, allele index) pairs; (-1, -1) means "no variant"
+ vector<pair<int, int>> candidate_choices[2];
+ for(int i = 0; i < 2; i++){
+ // because if it's (-1,-1), it will do nothing, so it's ok to have this one...
+ candidate_choices[i].push_back(pair<int, int>(-1, -1));
+ // to maintain existance
+ // in this position, make choice of not use any variants, no matter if there is variant
+
+ pair<multimap<int, int>::iterator, multimap<int, int>::iterator> var_range;
+ var_range = choices_by_pos[i]->equal_range(pos);
+
+ for(auto it = var_range.first; it != var_range.second; ++it){
+ int var_index = (*it).second;
+ DiploidVariant var = variant_list[var_index];
+ // check if current var influence
+ string ref = var.ref; //even we do not know the offset, we know ref start from pos of reference_sequence
+ string alts[2];
+ alts[0] = var.alts[0];
+ alts[1] = alts[0];
+ if(var.multi_alts){ //here do not have to change anything
+ alts[1] = var.alts[1];
+ }
+
+ // not just purely consider if a vqriant can be applied, but if a choice
+ // a choice is rejected when a position it covers was already edited ("." marks
+ // an untouched position) and the allele would delete or change that position
+ bool choice_applicable = true;
+ for(int k = 0; k < ref.length(); k++){
+ // for each ref char
+ int y = 0;
+ // for each strain
+ if(sp.string_sequences[i*2+y][k+pos] != "."){
+ // decision in this area has already been made
+ if(k >= alts[y].length()){
+ choice_applicable = false;
+ break;
+ }else{
+ if(ref[k] != alts[y][k]){
+ choice_applicable = false;
+ break;
+ }
+ }
+ }
+ }
+
+ if(choice_applicable){
+ candidate_choices[i].push_back(pair<int, int>(var_index, 0));
+ }
+
+ if(var.multi_alts){ // here do not have to change anything
+
+ //if heterozygous, then there is another choice, check if it is applicable
+ // swap the alleles so that alts[0] is the second alternate, then repeat the check
+ string temp = alts[0];
+ alts[0] = alts[1];
+ alts[1] = temp;
+
+ choice_applicable = true;
+ for(int k = 0; k < ref.length(); k++){
+ // for each ref char
+ //for(int y = 0; y < 2; y++)
+ int y = 0;
+ // for each strain
+ if(sp.string_sequences[i*2+y][k+pos] != "."){
+ // decision in this area has already been made
+ if(k >= alts[y].length()){
+ // should be a deletion
+ choice_applicable = false;
+ break;
+ }else{
+ // should be equal at current position
+ // can be an insertion, as long as current position is the same
+ if(ref[k] != alts[y][k]){
+ choice_applicable = false;
+ break;
+ }
+ }
+ }
+ }
+
+ if(choice_applicable){
+ candidate_choices[i].push_back(pair<int, int>(var_index, 1));
+ }
+ }
+ }
+ }
+
+ //dout << candidate_choices[0].size() << "," << candidate_choices[1].size() << endl;
+
+ // one new path per (baseline choice, query choice) combination
+ for(int i = 0; i < candidate_choices[0].size(); i++){
+ for(int j = 0; j < candidate_choices[1].size(); j++){
+ // iterate all choices
+ SequencePath path = sp;
+ pair<int, int> var_choice[2];
+ var_choice[0] = candidate_choices[0][i];
+ var_choice[1] = candidate_choices[1][j];
+ for(int x = 0; x < 2; x++){
+ // iterate truth and predict
+ int var_index = var_choice[x].first;
+ if(var_index != -1){
+ DiploidVariant var = variant_list[var_index];
+ // if(var.flag != x){
+ // dout << "Error" << endl;
+ // }
+ string ref = var.ref;
+ string alts[2];
+ int c = var_choice[x].second;
+ alts[0] = var.alts[c];
+ path.score += CalculateScore(var,
+ c,
+ score_unit,
+ match_mode,
+ score_scheme);
+
+ ToUpper(ref);
+ ToUpper(alts[0]);
+ int y = 0;
+
+ // all ref positions except the last: substitute one base or delete it ("")
+ // NOTE(review): ref.length()-1 is unsigned; an empty ref would wrap around - confirm ref is never empty
+ int k = 0;
+ for(; k < ref.length()-1; k++){
+ if(k < alts[y].length()){
+ if(ref[k] != alts[y][k]){
+ path.string_sequences[x*2+y][pos+k] = alts[y].substr(k,1);
+ }
+ // else change nothing
+ }else{
+ path.string_sequences[x*2+y][pos+k] = "";
+ }
+ }
+ // hence k == ref.length()-1, the last position
+ // the last ref position receives the whole remainder of the allele (insertion tail)
+ if(k < alts[y].length()){
+ string alt_part = alts[y].substr(k, alts[y].length()-k);
+ if(alt_part.length() > 1){
+ if(alt_part[0] == ref[k]){
+ // first base unchanged: keep an earlier edit at this position and append only the inserted tail
+ if(path.string_sequences[x*2+y][pos+k] == "."){
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }else{
+ path.string_sequences[x*2+y][pos+k] += alt_part.substr(1, alt_part.size() - 1);
+ }
+ }else{
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }
+ }else{
+ if(ref[k] != alts[y][k]){
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }
+ }
+ }else{
+ path.string_sequences[x*2+y][pos+k] = "";
+ }
+
+ }
+ // remember the choice taken for this side at this position
+ path.choice_made[x][pos] = var_choice[x];
+ }
+ sequence_path_list.push_back(path);
+ }
+ }
+
+ //expected number of inserted paths are 2,3,4,6,x...
+ return true;
+}
+
+// Appends to sequence_path_list a copy of sp in which variant variant_index
+// has been decided with choice c, applied at position sp.current_genome_pos+1.
+//
+// c == NOT_USE: the variant is recorded as skipped, no sequence is changed.
+// c == -1 / -2: strand 0 keeps ref, strand 1 takes alts[0] / alts[1].
+// c >= 0:       strand 0 takes alts[c]; strand 1 takes the other alternate
+//               (multi-alt, non zero-one variants), ref (other heterozygous
+//               variants) or the same allele.
+// Any other c prints "Unrecognized choice" through dout and continues with
+// both alleles empty.
+// Always returns true.
+bool WholeGenome::AppendChangedSp(SequencePath& sp,
+ vector<DiploidVariant> & variant_list,
+ list<SequencePath> & sequence_path_list,
+ const string & reference_sequence,
+ int score_unit,
+ int match_mode,
+ int score_scheme,
+ int variant_index,
+ int c)
+{
+ int pos = sp.current_genome_pos+1;
+
+ SequencePath path = sp;
+
+ if(c == NOT_USE){
+ path.choice_vector[variant_index] = c;
+ sequence_path_list.push_back(path);
+ return true;
+ }
+
+ pair<int, int> var_choice[2];
+ // x selects the side the variant belongs to: 0 when var.flag is false, 1 when it is set
+ int x = 0;
+ int var_index = variant_index;
+ DiploidVariant var = variant_list[var_index];
+ if(var.flag) x = 1;
+ string ref = var.ref;
+ string alts[2];
+
+ if(c == -1){
+ alts[0] = ref;
+ alts[1] = var.alts[0];
+ }else if(c == -2){
+ alts[0] = ref;
+ alts[1] = var.alts[1];
+ }else if(c >= 0){
+ // c == 0 or 1
+ alts[0] = var.alts[c];
+ alts[1] = alts[0];
+
+ if(var.multi_alts && !var.zero_one_var){
+ // choose 1 or 0
+ alts[1] = var.alts[1- c];
+ }else{
+ // c is 0, choose 0 or -1
+ if(var.heterozygous) alts[1] = ref;
+ }
+ }else{
+ dout << "Unrecognized choice" << endl;
+ }
+ path.score += CalculateScore(var,
+ c,
+ score_unit,
+ match_mode,
+ score_scheme);
+ ToUpper(ref);
+ ToUpper(alts[0]);
+ ToUpper(alts[1]);
+ for(int y = 0; y < 2; y++){
+ // iterate two alts
+ string alt = alts[y];
+ // a strand that keeps the reference allele needs no edit
+ if(alt == ref) continue;
+ // alt_vector[k] = alt bases assigned to ref position k (empty string = deleted)
+ vector<string> alt_vector;
+ GenerateAltVector(ref, alt, alt_vector);
+
+ // NOTE(review): ref.length()-1 is unsigned and GenerateAltVector fills nothing
+ // for an empty ref - confirm ref is never empty here
+ int k = 0;
+ for(; k < ref.length()-1; k++){
+
+ // write the position unless it is a single base identical to the reference
+ if(alt_vector[k].size() != 1 || ref[k] != alt_vector[k][0]){
+ path.string_sequences[x*2+y][pos+k] = alt_vector[k];
+ }
+ // else changes nothing
+
+ }
+ // hence k == ref.length()-1, the last position
+ assert(k == ref.length()-1);
+ string alt_part = alt_vector[k];
+ if(alt_part.length() > 0){
+ if(alt_part.length() > 1){
+ if(alt_part[0] == ref[k]){
+ // first base unchanged: keep an earlier edit at this position ("." = untouched)
+ // and append only the inserted tail
+ if(path.string_sequences[x*2+y][pos+k] == "."){
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }else{
+ path.string_sequences[x*2+y][pos+k] += alt_part.substr(1, alt_part.size() - 1);
+ }
+ }else{
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }
+ }else{
+ if(ref[k] != alt_vector[k][0]){
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }
+ }
+ }else{
+ // nothing assigned to the last ref base: it is deleted
+ path.string_sequences[x*2+y][pos+k] = "";
+ }
+ }
+
+ // choice made
+ path.choice_vector[variant_index] = c;
+ //dout << "after decision at variant " << variant_index << endl;
+ //PrintPath(path);
+ sequence_path_list.push_back(path);
+ return true;
+}
+
+
+// Question: when you make a decision, do you also need to align?
+// Answer: No, it makes no difference, so alignment is currently skipped here.
+//
+// Branches one partial path on a single variant (genotype-aware matching).
+// For variant `variant_index`, AppendChangedSp is called on `sp` once per
+// admissible choice code, and is handed `sequence_path_list` to receive the
+// resulting paths:
+//   NOT_USE  always tried (the variant is left out)
+//   0        alts[0] checked on strand 0; strand 1 is checked against alts[1]
+//            (two-alt variant that is not zero_one_var), is reference (other
+//            heterozygous variants) or is alts[0] again (homozygous)
+//   1 / -1   heterozygous only: the same two alleles with the strands
+//            swapped; 1 for a two-alt, non-zero_one_var variant, -1 otherwise
+//   1 / -2   multi-alt zero_one_var only: alts[1] checked on strand 0 (1) or
+//            on strand 1 (-2)
+// A choice other than NOT_USE is admissible only if, on every strand that is
+// checked, each reference position covered by the variant is either still
+// undecided (".") or keeps its reference base under the allele.
+//
+// Always returns true (the original fell off the end of this bool function,
+// which is undefined behaviour).
+bool WholeGenome::VariantMakeDecision(SequencePath& sp,
+                             vector<DiploidVariant> & variant_list,
+                             list<SequencePath> & sequence_path_list,
+                             const string & reference_sequence,
+                             int score_unit,
+                             int match_mode,
+                             int score_scheme,
+                             int variant_index)
+{
+    const int pos = sp.current_genome_pos+1;
+    DiploidVariant var = variant_list[variant_index];
+
+    // Forwards one choice code for this variant to AppendChangedSp.
+    auto append_choice = [&](int choice){
+        AppendChangedSp(sp,
+                        variant_list,
+                        sequence_path_list,
+                        reference_sequence,
+                        score_unit,
+                        match_mode,
+                        score_scheme,
+                        variant_index,
+                        choice);
+    };
+
+    // the variant may also not be used at all
+    append_choice(NOT_USE);
+
+    // rows 0/1 of string_sequences are read when var.flag is false, rows 2/3 otherwise
+    const int first_row = var.flag ? 2 : 0;
+
+    // even without knowing the offset, ref starts at pos of reference_sequence
+    const string ref = var.ref;
+    const int ref_len = static_cast<int>(ref.length());
+
+    // True when `alt` can still be placed on strand y: every covered position
+    // is either undecided (".") or keeps the reference base under `alt`.
+    // A position that `alt` deletes or changes is a conflict.
+    auto strand_accepts = [&](int y, const string & alt) -> bool {
+        const int alt_len = static_cast<int>(alt.length());
+        for(int k = 0; k < ref_len; k++){
+            if(sp.string_sequences[first_row+y][k+pos] == ".") continue;
+            if(k >= alt_len || ref[k] != alt[k]) return false;
+        }
+        return true;
+    };
+
+    string alts[2];
+    alts[0] = var.alts[0];
+    alts[1] = alts[0];
+    if(var.multi_alts && !var.zero_one_var){
+        alts[1] = var.alts[1];
+    }else if(var.heterozygous){
+        alts[1] = ref;
+    }
+
+    // choice 0: a strand that stays reference (alts[1] == ref) needs no check
+    if(strand_accepts(0, alts[0]) && (alts[1] == ref || strand_accepts(1, alts[1]))){
+        append_choice(0);
+    }
+
+    if(var.heterozygous){
+        // heterozygous: the swapped assignment is a second, distinct choice
+        alts[0].swap(alts[1]);
+        if((alts[0] == ref || strand_accepts(0, alts[0])) && strand_accepts(1, alts[1])){
+            if(var.multi_alts && !var.zero_one_var){
+                append_choice(1);
+            }else{
+                append_choice(-1);
+            }
+        }
+    }
+
+    if(var.multi_alts && var.zero_one_var){
+        // two more combinations: alt1/ref and ref/alt1
+        const string second_alt = var.alts[1];
+        if(strand_accepts(0, second_alt)){
+            append_choice(1);
+        }
+        if(strand_accepts(1, second_alt)){
+            append_choice(-2);
+        }
+    }
+
+    return true;
+}
+
+// no genotype means you only need to maintain one strand
+// for simplicity, this also works on the original SequencePath data structure:
+// when making a decision, only decide one path
+// when extending, only extend one path
+// when comparing, only compare one path
+//
+// Branches one partial path on variant `variant_index`, looking only at
+// strand 0 of the variant's row pair (rows 0/1 when var.flag is false, 2/3
+// otherwise). Calls made, in order:
+//   AppendChangedSpNoGenotype(..., NOT_USE)  always
+//   AppendChangedSpNoGenotype(..., 0)        if alts[0] does not conflict
+//   AppendChangedSp(..., 1)                  if the variant has multi_alts and
+//                                            the second candidate allele does
+//                                            not conflict
+// An allele conflicts when some covered reference position is already decided
+// (not ".") and the allele deletes or changes the base at that position.
+//
+// Always returns true (the original fell off the end of this bool function,
+// which is undefined behaviour).
+bool WholeGenome::VariantMakeDecisionNoGenotype(SequencePath& sp,
+                             vector<DiploidVariant> & variant_list,
+                             list<SequencePath> & sequence_path_list,
+                             const string & reference_sequence,
+                             int score_unit,
+                             int match_mode,
+                             int score_scheme,
+                             int variant_index)
+{
+    const int pos = sp.current_genome_pos+1;
+    DiploidVariant var = variant_list[variant_index];
+
+    // the variant may also not be used at all
+    AppendChangedSpNoGenotype(sp,
+                              variant_list,
+                              sequence_path_list,
+                              reference_sequence,
+                              score_unit,
+                              match_mode,
+                              score_scheme,
+                              variant_index,
+                              NOT_USE);
+
+    // only the first strand of the pair is inspected in this mode
+    const int row = var.flag ? 2 : 0;
+
+    // even without knowing the offset, ref starts at pos of reference_sequence
+    const string ref = var.ref;
+    const int ref_len = static_cast<int>(ref.length());
+
+    // True when every covered position is undecided (".") or keeps its
+    // reference base under `alt`.
+    auto strand_accepts = [&](const string & alt) -> bool {
+        const int alt_len = static_cast<int>(alt.length());
+        for(int k = 0; k < ref_len; k++){
+            if(sp.string_sequences[row][k+pos] == ".") continue;
+            if(k >= alt_len || ref[k] != alt[k]) return false;
+        }
+        return true;
+    };
+
+    const string first_alt = var.alts[0];
+    string second_alt = first_alt;
+    if(var.multi_alts && !var.zero_one_var){
+        second_alt = var.alts[1];
+    }else if(var.heterozygous){
+        second_alt = ref;
+    }
+
+    if(strand_accepts(first_alt)){
+        AppendChangedSpNoGenotype(sp,
+                                  variant_list,
+                                  sequence_path_list,
+                                  reference_sequence,
+                                  score_unit,
+                                  match_mode,
+                                  score_scheme,
+                                  variant_index,
+                                  0);
+    }
+
+    // if the variant is 0/1, applying reference makes no sense: it equals NOT_USE
+    // if the variant is 0/1 but contains multi alts, the other alt should be tried
+    // if the variant is 1/2, the other alt should be tried
+    // if the variant is 1/1 or 2/2, the other alt should not be tried
+    // but phasing is not considered here,
+    // so as long as the variant has multi_alts, the other alt is used
+    //
+    // NOTE(review): for a multi-alt zero_one_var, second_alt is ref (or
+    // alts[0]) rather than alts[1], so the check below does not look at the
+    // allele that choice 1 applies -- confirm this is intended.
+    // NOTE(review): this branch calls AppendChangedSp, not
+    // AppendChangedSpNoGenotype as the other two branches do; kept as in the
+    // original -- confirm which one is meant.
+    if(var.multi_alts && strand_accepts(second_alt)){
+        AppendChangedSp(sp,
+                        variant_list,
+                        sequence_path_list,
+                        reference_sequence,
+                        score_unit,
+                        match_mode,
+                        score_scheme,
+                        variant_index,
+                        1);
+    }
+
+    return true;
+}
+
+
+// Appends to `sequence_path_list` a copy of `sp` with choice `c` recorded for
+// variant `variant_index` (single-strand / no-genotype mode).
+//   c == NOT_USE : only choice_vector is updated.
+//   c == 0 or 1  : the score from CalculateScore is added and allele
+//                  var.alts[c] (upper-cased) is written into the first strand
+//                  of the variant's row pair (row 0 when var.flag is false,
+//                  row 2 otherwise), starting at current_genome_pos + 1.
+//   anything else: logged through dout and rejected; nothing is appended.
+// Returns true when a path was appended, false for an unrecognized choice.
+// (The original only logged an unrecognized choice and then went on to apply
+// an empty allele and append the resulting path.)
+bool WholeGenome::AppendChangedSpNoGenotype(SequencePath& sp,
+                             vector<DiploidVariant> & variant_list,
+                             list<SequencePath> & sequence_path_list,
+                             const string & reference_sequence,
+                             int score_unit,
+                             int match_mode,
+                             int score_scheme,
+                             int variant_index,
+                             int c)
+{
+    if(c != NOT_USE && c != 0 && c != 1){
+        dout << "Unrecognized choice" << endl;
+        return false;
+    }
+
+    SequencePath path = sp;
+    if(c == NOT_USE){
+        path.choice_vector[variant_index] = c;
+        sequence_path_list.push_back(path);
+        return true;
+    }
+
+    const int pos = sp.current_genome_pos+1;
+    DiploidVariant var = variant_list[variant_index];
+    const int row = var.flag ? 2 : 0;   // first strand of the pair selected by var.flag
+
+    string ref = var.ref;
+    string alt = var.alts[c];
+
+    path.score += CalculateScore(var,
+                                 c,
+                                 score_unit,
+                                 match_mode,
+                                 score_scheme);
+    ToUpper(ref);
+    ToUpper(alt);
+
+    vector<string> alt_vector;
+    GenerateAltVector(ref, alt, alt_vector);
+
+    // An empty ref would make ref.length()-1 wrap around (it is unsigned);
+    // fail loudly instead of indexing out of range.
+    assert(!ref.empty());
+    const int last = static_cast<int>(ref.length()) - 1;
+
+    // every position before the last: overwrite only where the allele differs
+    // from the single reference base
+    for(int k = 0; k < last; k++){
+        const string & piece = alt_vector[k];
+        if(piece.size() != 1 || ref[k] != piece[0]){
+            path.string_sequences[row][pos+k] = piece;
+        }
+        // else changes nothing
+    }
+
+    // the last position
+    auto & cell = path.string_sequences[row][pos+last];
+    const string & tail = alt_vector[last];
+    if(tail.empty()){
+        cell = "";
+    }else if(tail.length() == 1){
+        if(ref[last] != tail[0]){
+            cell = tail;
+        }
+    }else if(tail[0] != ref[last] || cell == "."){
+        cell = tail;
+    }else{
+        // the reference base is kept and the cell is already decided:
+        // append only the bases that follow it
+        cell += tail.substr(1);
+    }
+
+    // choice made
+    path.choice_vector[variant_index] = c;
+    sequence_path_list.push_back(path);
+    return true;
+}
+
+// this function is no longer used, because you cannot make a decision for one position at once:
+// there might be multiple variants at one position,
+// so a better way is to make the decision for one variant at a time.
+// previously this was only meant to save some time, but it ignored the multiple-variant condition
+//
+// For position current_genome_pos + 1, gathers the admissible
+// (variant index, choice code) pairs from choices_by_pos[0] and
+// choices_by_pos[1] (plus the "no variant" pair (-1,-1) for each), then
+// appends one modified copy of `sp` to `sequence_path_list` for every
+// combination of one pair from each side. Always returns true.
+bool WholeGenome::PathMakeDecision(SequencePath& sp,
+                             vector<DiploidVariant> & variant_list,
+                             multimap<int, int> * choices_by_pos[],
+                             list<SequencePath> & sequence_path_list,
+                             const string & reference_sequence,
+                             int score_unit,
+                             int match_mode,
+                             int score_scheme)
+{
+    const int pos = sp.current_genome_pos+1;
+
+    vector<pair<int, int>> candidate_choices[2];
+    for(int side = 0; side < 2; side++){
+        vector<pair<int, int>> & candidates = candidate_choices[side];
+
+        // (-1,-1) does nothing when applied, so it is always offered:
+        // at this position, choose not to use any variant, whether or not one exists
+        candidates.push_back(pair<int, int>(-1, -1));
+
+        auto var_range = choices_by_pos[side]->equal_range(pos);
+        for(auto it = var_range.first; it != var_range.second; ++it){
+            const int var_index = it->second;
+            DiploidVariant var = variant_list[var_index];
+
+            // even without knowing the offset, ref starts at pos of reference_sequence
+            const string ref = var.ref;
+            const int ref_len = static_cast<int>(ref.length());
+
+            // True when every covered position on strand y is undecided (".")
+            // or keeps its reference base under `alt`.
+            auto strand_accepts = [&](int y, const string & alt) -> bool {
+                const int alt_len = static_cast<int>(alt.length());
+                for(int k = 0; k < ref_len; k++){
+                    if(sp.string_sequences[side*2+y][k+pos] == ".") continue;
+                    if(k >= alt_len || ref[k] != alt[k]) return false;
+                }
+                return true;
+            };
+
+            string alts[2];
+            alts[0] = var.alts[0];
+            alts[1] = alts[0];
+            if(var.multi_alts && !var.zero_one_var){
+                alts[1] = var.alts[1];
+            }else if(var.heterozygous){
+                alts[1] = ref;
+            }
+
+            // not purely whether a variant can be applied, but whether a choice can;
+            // a strand that stays reference is not checked
+            if(strand_accepts(0, alts[0]) && (alts[1] == ref || strand_accepts(1, alts[1]))){
+                candidates.push_back(pair<int, int>(var_index, 0));
+            }
+
+            if(var.heterozygous){
+                // heterozygous: the swapped assignment is another choice
+                alts[0].swap(alts[1]);
+                if((alts[0] == ref || strand_accepts(0, alts[0])) && strand_accepts(1, alts[1])){
+                    const bool two_alts = var.multi_alts && !var.zero_one_var;
+                    candidates.push_back(pair<int, int>(var_index, two_alts ? 1 : -1));
+                }
+            }
+
+            if(var.multi_alts && var.zero_one_var){
+                // two more combinations: alt1/ref and ref/alt1
+                const string second_alt = var.alts[1];
+                if(strand_accepts(0, second_alt)){
+                    candidates.push_back(pair<int, int>(var_index, 1));
+                }
+                if(strand_accepts(1, second_alt)){
+                    candidates.push_back(pair<int, int>(var_index, -2));
+                }
+            }
+        }
+    }
+
+    // iterate all combinations of one candidate per side
+    for(const pair<int, int> & choice_side0 : candidate_choices[0]){
+        for(const pair<int, int> & choice_side1 : candidate_choices[1]){
+            SequencePath path = sp;
+            pair<int, int> var_choice[2];
+            var_choice[0] = choice_side0;
+            var_choice[1] = choice_side1;
+
+            for(int x = 0; x < 2; x++){
+                // x selects the row pair: 0 -> rows 0/1, 1 -> rows 2/3
+                const int var_index = var_choice[x].first;
+                if(var_index != -1){
+                    DiploidVariant var = variant_list[var_index];
+                    string ref = var.ref;
+                    string alts[2];
+
+                    const int c = var_choice[x].second;
+                    switch(c){
+                    case -1:
+                        alts[0] = ref;
+                        alts[1] = var.alts[0];
+                        break;
+                    case -2:
+                        alts[0] = ref;
+                        alts[1] = var.alts[1];
+                        break;
+                    default:
+                        // c == 0 or 1
+                        alts[0] = var.alts[c];
+                        alts[1] = alts[0];
+                        if(var.multi_alts && !var.zero_one_var){
+                            // the other of the two alts
+                            alts[1] = var.alts[1- c];
+                        }else if(var.heterozygous){
+                            alts[1] = ref;
+                        }
+                        break;
+                    }
+
+                    path.score += CalculateScore(var,
+                                                 c,
+                                                 score_unit,
+                                                 match_mode,
+                                                 score_scheme);
+
+                    ToUpper(ref);
+                    ToUpper(alts[0]);
+                    ToUpper(alts[1]);
+                    const int ref_len = static_cast<int>(ref.length());
+
+                    for(int y = 0; y < 2; y++){
+                        // write one allele into one strand
+                        string alt = alts[y];
+                        vector<string> alt_vector;
+                        GenerateAltVector(ref, alt, alt_vector);
+                        auto & strand = path.string_sequences[x*2+y];
+
+                        int k = 0;
+                        while(k + 1 < ref_len){
+                            const string & piece = alt_vector[k];
+                            if(piece.size() != 1 || ref[k] != piece[0]){
+                                strand[pos+k] = piece;
+                            }
+                            // else changes nothing
+                            k++;
+                        }
+                        // hence k == ref.length()-1, the last position
+                        assert(k == ref.length()-1);
+
+                        const string alt_part = alt_vector[k];
+                        auto & cell = strand[pos+k];
+                        if(alt_part.empty()){
+                            cell = "";
+                        }else if(alt_part.length() == 1){
+                            if(ref[k] != alt_part[0]){
+                                cell = alt_part;
+                            }
+                        }else if(alt_part[0] != ref[k] || cell == "."){
+                            cell = alt_part;
+                        }else{
+                            cell += alt_part.substr(1, alt_part.size() - 1);
+                        }
+                    }
+                }
+                path.choice_made[x][pos] = var_choice[x];
+            }
+
+            // choice made
+            dout << "after decision at pos " << pos << endl;
+            PrintPath(path);
+            sequence_path_list.push_back(path);
+        }
+    }
+
+    //expected number of inserted paths are 2,3,4,6,x...
+    return true;
+}
+
+// Empty stub with the same parameter list as PathMakeDecision.
+// None of the arguments is read or modified; the call always succeeds.
+bool WholeGenome::PathMakeDecisionBackup(SequencePath& sp,
+                             vector<DiploidVariant> & variant_list,
+                             multimap<int, int> * choices_by_pos[],
+                             list<SequencePath> & sequence_path_list,
+                             const string & reference_sequence,
+                             int score_unit,
+                             int match_mode,
+                             int score_scheme)
+{
+    // nothing is inserted here
+    // (original note: expected number of inserted paths are 2,3,4,6,x...)
+    return true;
+}
+
+// Dumps a SequencePath to stdout for debugging: the four string_sequences
+// rows (cells separated by a space), the four donor_sequences, the removable
+// flag, and the choice_vector as a comma-terminated list.
+void WholeGenome::PrintPath(SequencePath & sp){
+    cout << "- Sequence Path:" << endl;
+
+    cout << "@ String Sequences:" << endl;
+    for(int row = 0; row < 4; ++row){
+        const auto & cells = sp.string_sequences[row];
+        for(int col = 0; col < cells.size(); ++col){
+            cout << cells[col] << " ";
+        }
+        cout << endl;
+    }
+
+    cout << "@ Donor Sequences:" << endl;
+    for(int row = 0; row < 4; ++row){
+        cout << sp.donor_sequences[row] << endl;
+    }
+
+    cout << "@ Removable: " << sp.removable << endl;
+
+    const auto & choices = sp.choice_vector;
+    for(int idx = 0; idx < choices.size(); ++idx){
+        cout << choices[idx] << ",";
+    }
+    cout << endl;
+}
+
+// next: loop until the current path list is empty
+// if a path extends to a sync point, add it to the next path list
+// if a path needs a decision, make the decision and append the branches to the current list
+// if a path reaches the end, compare it with the best path
+//
+// Searches the paths of one variant cluster and records the best one.
+// Starts from a single SequencePath sized by subsequence.length() and
+// variant_list.size(), then repeatedly calls PathExtendOneStep and acts on
+// its result code:
+//   -1  the path is dropped
+//    0  the path is parked for the next round
+//    1  the path is branched on variant `variant_need_decision` by
+//       VariantMakeDecision (match_mode == 0) or VariantMakeDecisionNoGenotype
+//    2  the path is finished; it replaces best_path if its score is higher
+// After each round the parked paths are passed to ConvergePaths.
+//
+// Returns false when the best score is <= 0 (nothing is recorded); otherwise
+// hands the best path to ConstructMatchRecord / ConstructMatchRecordNoGenotype
+// and returns true. `cluster_index` is not used in this function body.
+bool WholeGenome::MatchingSingleClusterBaseExtending(int cluster_index,
+                                                    int thread_index,
+                                                    vector<DiploidVariant> & variant_list,
+                                                    string & subsequence,
+                                                    int offset,
+                                                    multimap<int, int> * choices_by_pos[],
+                                                    vector<int> & sync_points,
+                                                    int chr_id,
+                                                    int score_unit,
+                                                    int match_mode,
+                                                    int score_scheme,
+                                                    int threshold_index){
+    // a legal sync_points vector contains at least two entries:
+    // the first is the end of a variant (there should be at least one variant),
+    // the second is the end of subsequence (there should be at least one nt not influenced by a variant)
+
+    list<SequencePath> current_path_list;
+    list<SequencePath> next_path_list;
+    SequencePath sp(subsequence.length(), variant_list.size());
+    SequencePath best_path = sp;
+    current_path_list.push_back(sp);
+
+    while(!current_path_list.empty()){
+        // extend every path until it reaches a sync point, ends, or is dropped
+        while(!current_path_list.empty()){
+            SequencePath path = current_path_list.front();
+            current_path_list.pop_front();
+
+            int variant_need_decision = -1;
+            const int is_extend = PathExtendOneStep(path, choices_by_pos, subsequence, sync_points, match_mode, variant_need_decision);
+
+            switch(is_extend){
+            case 0:
+                // the path is supposed to have reached the next sync point,
+                // so it moves to the next path list
+                next_path_list.push_back(path);
+                break;
+            case 1:
+                // branches are appended to current_path_list and processed in this round
+                if(match_mode == 0){
+                    VariantMakeDecision(path,
+                                        variant_list,
+                                        current_path_list,
+                                        subsequence,
+                                        score_unit,
+                                        match_mode,
+                                        score_scheme,
+                                        variant_need_decision);
+                }else{
+                    VariantMakeDecisionNoGenotype(path,
+                                                  variant_list,
+                                                  current_path_list,
+                                                  subsequence,
+                                                  score_unit,
+                                                  match_mode,
+                                                  score_scheme,
+                                                  variant_need_decision);
+                }
+                break;
+            case 2:
+                // only a path that reached the very end can become the best path
+                if(path.score > best_path.score){
+                    best_path = path;
+                }
+                break;
+            default:
+                // -1 (and any other code): discard the path
+                break;
+            }
+        }
+
+        // current_path_list is empty here, so swapping hands over the parked
+        // paths without copying every SequencePath
+        current_path_list.swap(next_path_list);
+        next_path_list.clear();
+
+        if(!current_path_list.empty()){
+            ConvergePaths(current_path_list);
+        }
+    }
+
+    if(best_path.score <= 0) return false;
+
+    //==========================output ======================
+    const int mode_index = GetIndexFromMatchScore(score_unit, match_mode, score_scheme);
+
+    if(match_mode == 0){
+        ConstructMatchRecord(best_path,
+                             variant_list,
+                             subsequence,
+                             offset,
+                             thread_index,
+                             chr_id,
+                             mode_index,
+                             threshold_index);
+    }else{
+        ConstructMatchRecordNoGenotype(best_path,
+                                       variant_list,
+                                       subsequence,
+                                       offset,
+                                       thread_index,
+                                       chr_id,
+                                       mode_index,
+                                       threshold_index);
+    }
+    return true;
+}
+
+// Returns bit 2 (the bit of value 4) of mode_index, i.e. 0 or 1.
+int GetMatchmodeFromModeIndex(int mode_index){
+    return (mode_index >> 2) & 1;
+}
+
+// Accounts for the best path of one cluster (genotype-aware mode).
+// Every variant whose recorded choice is above NOT_USE is counted, together
+// with its CalculateEditDistance(variant, choice, 0), on the side given by
+// variant.flag (false -> baseline totals, true -> query totals). The totals
+// are added to the per-thread / per-threshold counters at `mode_index`.
+// When threshold_index == 0 a tab-separated match record is also pushed to
+// match_records_by_mode_by_thread:
+//   chrom, offset+1, subsequence, donor0[/donor1],
+//   variants of side 0, variants of side 1 ("pos,ref,alt[/alt2]" joined by ';'),
+//   phasings of side 0, phasings of side 1 (joined by ';'), score
+void WholeGenome::ConstructMatchRecord(SequencePath & best_path,
+                                       vector<DiploidVariant> & variant_list,
+                                       string & subsequence,
+                                       int offset,
+                                       int thread_index,
+                                       int chr_id,
+                                       int mode_index,
+                                       int threshold_index){
+    // index 0: variants with flag == false, index 1: variants with flag == true
+    int matched_num[2] = {0, 0};
+    int edit_distance_sum[2] = {0, 0};
+
+    const bool need_match_record = (threshold_index == 0);
+
+    const string record_ref = subsequence;
+    const string record_alt0 = best_path.donor_sequences[0];
+    const string record_alt1 = best_path.donor_sequences[1];
+    const bool multiple_match = (record_alt0 != record_alt1);
+
+    string match_record = chrname_by_chrid[chr_id] + "\t" + to_string(offset+1) + "\t" + record_ref + "\t" + record_alt0;
+    if(multiple_match) match_record += "/" + record_alt1;
+
+    // Phased genotype text for choice code `phasing` on `variant`.
+    auto phasing_label = [](const DiploidVariant & variant, int phasing) -> string {
+        const bool two_alts = variant.multi_alts && !variant.zero_one_var;
+        switch(phasing){
+        case 0:
+            if(!variant.heterozygous) return "1|1";
+            return two_alts ? "1|2" : "1|0";
+        case 1:
+            if(two_alts) return "2|1";
+            return variant.multi_alts ? "2|0" : "0|1";
+        case -1:
+            return "0|1";
+        case -2:
+            return "0|2";
+        default:
+            return "";
+        }
+    };
+
+    // Removes the trailing ';' separator, if any.
+    auto drop_last_char = [](string & text){
+        if(!text.empty()) text.erase(text.size()-1);
+    };
+
+    string vcf_record[2];
+    string phasing_record[2];
+
+    for (int side = 0; side < 2; side++) {
+        for (int var_index = 0; var_index < variant_list.size(); var_index++) {
+            DiploidVariant variant = variant_list[var_index];
+            // flag is compared as an integer: false -> 0, true -> 1
+            if(variant.flag != side) continue;
+
+            const int phasing = best_path.choice_vector[var_index];
+            if(phasing <= NOT_USE) continue;
+
+            edit_distance_sum[side] += CalculateEditDistance(variant, phasing, 0);
+            matched_num[side]++;
+
+            if(!need_match_record) continue;
+
+            string alt_string = variant.alts[0];
+            if(variant.multi_alts){
+                alt_string += "/" + variant.alts[1];
+            }
+            vcf_record[side] += to_string(variant.pos+1) + "," + variant.ref + "," + alt_string;
+            vcf_record[side] += ";";
+            phasing_record[side] += phasing_label(variant, phasing);
+            phasing_record[side] += ";";
+        }
+        if(need_match_record){
+            drop_last_char(vcf_record[side]);
+            drop_last_char(phasing_record[side]);
+        }
+    }
+
+    if(need_match_record){
+        match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+        match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+        match_record += "\t" + to_string(best_path.score) + "\n";
+
+        match_records_by_mode_by_thread[thread_index][mode_index]->push_back(match_record);
+    }
+
+    baseline_total_match_num[thread_index][threshold_index]->at(mode_index) += matched_num[0];
+    query_total_match_num[thread_index][threshold_index]->at(mode_index) += matched_num[1];
+
+    baseline_total_edit_distance[thread_index][threshold_index]->at(mode_index) += edit_distance_sum[0];
+    query_total_edit_distance[thread_index][threshold_index]->at(mode_index) += edit_distance_sum[1];
+}
+
+// Accounts for the best path of one cluster (no-genotype mode).
+// Every variant whose recorded choice is above NOT_USE is counted, together
+// with its CalculateEditDistance(variant, choice, 1), on the side given by
+// variant.flag (false -> baseline totals, true -> query totals). The totals
+// are added to the per-thread / per-threshold counters at `mode_index`.
+// When threshold_index == 0 a tab-separated match record is also pushed to
+// match_records_by_mode_by_thread; only donor_sequences[0] is reported, and
+// a choice is written as "1|1" (choice 0) or "2|2" (choice 1).
+void WholeGenome::ConstructMatchRecordNoGenotype(SequencePath & best_path,
+                                                 vector<DiploidVariant> & variant_list,
+                                                 string & subsequence,
+                                                 int offset,
+                                                 int thread_index,
+                                                 int chr_id,
+                                                 int mode_index,
+                                                 int threshold_index){
+    // index 0: variants with flag == false, index 1: variants with flag == true
+    int matched_num[2] = {0, 0};
+    int edit_distance_sum[2] = {0, 0};
+
+    const bool need_match_record = (threshold_index == 0);
+
+    const string record_ref = subsequence;
+    const string record_alt0 = best_path.donor_sequences[0];
+
+    string match_record = chrname_by_chrid[chr_id] + "\t" + to_string(offset+1) + "\t" + record_ref + "\t" + record_alt0;
+
+    // Removes the trailing ';' separator, if any.
+    auto drop_last_char = [](string & text){
+        if(!text.empty()) text.erase(text.size()-1);
+    };
+
+    string vcf_record[2];
+    string phasing_record[2];
+
+    for (int side = 0; side < 2; side++) {
+        for (int var_index = 0; var_index < variant_list.size(); var_index++) {
+            DiploidVariant variant = variant_list[var_index];
+            // flag is compared as an integer: false -> 0, true -> 1
+            if(variant.flag != side) continue;
+
+            const int phasing = best_path.choice_vector[var_index];
+            if(phasing <= NOT_USE) continue;
+
+            edit_distance_sum[side] += CalculateEditDistance(variant, phasing, 1);
+            matched_num[side]++;
+
+            if(!need_match_record) continue;
+
+            string alt_string = variant.alts[0];
+            if(variant.multi_alts){
+                alt_string += "/" + variant.alts[1];
+            }
+
+            string phasing_string = "";
+            switch(phasing){
+            case 0: phasing_string += "1|1"; break;
+            case 1: phasing_string += "2|2"; break;
+            default: break;
+            }
+
+            vcf_record[side] += to_string(variant.pos+1) + "," + variant.ref + "," + alt_string;
+            vcf_record[side] += ";";
+            phasing_record[side] += phasing_string;
+            phasing_record[side] += ";";
+        }
+        if(need_match_record){
+            drop_last_char(vcf_record[side]);
+            drop_last_char(phasing_record[side]);
+        }
+    }
+
+    if(need_match_record){
+        match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+        match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+        match_record += "\t" + to_string(best_path.score) + "\n";
+
+        match_records_by_mode_by_thread[thread_index][mode_index]->push_back(match_record);
+    }
+
+    baseline_total_match_num[thread_index][threshold_index]->at(mode_index) += matched_num[0];
+    query_total_match_num[thread_index][threshold_index]->at(mode_index) += matched_num[1];
+
+    baseline_total_edit_distance[thread_index][threshold_index]->at(mode_index) += edit_distance_sum[0];
+    query_total_edit_distance[thread_index][threshold_index]->at(mode_index) += edit_distance_sum[1];
+}
+
+// function no longer used; backup of the old method
+//
+// Older variant of ConstructMatchRecord that walks best_path.choice_made[0]
+// and [1] instead of choice_vector. Entries whose variant index is -1 are
+// skipped; every other entry is counted as baseline (variant.flag false) or
+// query (variant.flag true) and, when threshold_index == 0, written into the
+// match record under the choice_made index it came from. Only the match
+// counters are updated; no edit distance is accumulated.
+void WholeGenome::ConstructMatchRecordBackup(SequencePath & best_path,
+                                       vector<DiploidVariant> & variant_list,
+                                       string & subsequence,
+                                       int offset,
+                                       int thread_index,
+                                       int chr_id,
+                                       int mode_index,
+                                       int threshold_index){
+    int truth_num = 0;
+    int predict_num = 0;
+
+    const bool need_match_record = (threshold_index == 0);
+
+    const string record_ref = subsequence;
+    const string record_alt0 = best_path.donor_sequences[0];
+    const string record_alt1 = best_path.donor_sequences[1];
+    const bool multiple_match = (record_alt0 != record_alt1);
+
+    string match_record = chrname_by_chrid[chr_id] + "\t" + to_string(offset+1) + "\t" + record_ref + "\t" + record_alt0;
+    if(multiple_match) match_record += "/" + record_alt1;
+
+    // Phased genotype text for choice code `phasing` on `variant`.
+    auto phasing_label = [](const DiploidVariant & variant, int phasing) -> string {
+        const bool two_alts = variant.multi_alts && !variant.zero_one_var;
+        switch(phasing){
+        case 0:
+            if(!variant.heterozygous) return "1|1";
+            return two_alts ? "1|2" : "1|0";
+        case 1:
+            if(two_alts) return "2|1";
+            return variant.multi_alts ? "2|0" : "0|1";
+        case -1:
+            return "0|1";
+        case -2:
+            return "0|2";
+        default:
+            return "";
+        }
+    };
+
+    // Removes the trailing ';' separator, if any.
+    auto drop_last_char = [](string & text){
+        if(!text.empty()) text.erase(text.size()-1);
+    };
+
+    string vcf_record[2];
+    string phasing_record[2];
+
+    for (int side = 0; side < 2; side++) {
+        for (const auto & entry : best_path.choice_made[side]) {
+            const pair<int, int> selection = entry.second;
+            if(selection.first == -1) continue;   // no variant chosen at this position
+            const int phasing = selection.second;
+
+            DiploidVariant variant = variant_list[selection.first];
+            if(variant.flag){
+                predict_num++;
+            }else{
+                truth_num++;
+            }
+
+            if(!need_match_record) continue;
+
+            string alt_string = variant.alts[0];
+            if(variant.multi_alts){
+                alt_string += "/" + variant.alts[1];
+            }
+            vcf_record[side] += to_string(variant.pos+1) + "," + variant.ref + "," + alt_string;
+            vcf_record[side] += ";";
+            phasing_record[side] += phasing_label(variant, phasing);
+            phasing_record[side] += ";";
+        }
+        if(need_match_record){
+            drop_last_char(vcf_record[side]);
+            drop_last_char(phasing_record[side]);
+        }
+    }
+
+    if(need_match_record){
+        match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+        match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+        match_record += "\t" + to_string(best_path.score) + "\n";
+
+        match_records_by_mode_by_thread[thread_index][mode_index]->push_back(match_record);
+    }
+
+    baseline_total_match_num[thread_index][threshold_index]->at(mode_index) += truth_num;
+    query_total_match_num[thread_index][threshold_index]->at(mode_index) += predict_num;
+}
+
+// Backup implementation of the genotype-agnostic match-record builder.
+//
+// Walks the choices stored in best_path.choice_made[0] and [1] and counts every
+// selected variant: variants whose `flag` is false are added to the baseline
+// counter, the others to the query counter. Choices with variant id -1 or
+// phasing index -1 are skipped entirely.
+//
+// When threshold_index == 0 one tab-separated record is appended to
+// match_records_by_mode_by_thread[thread_index][mode_index]:
+//   chrom, offset+1, subsequence, donor_sequences[0],
+//   variants of choice_made[0], variants of choice_made[1],
+//   phasing of choice_made[0], phasing of choice_made[1], score
+// Each variant is written as "pos+1,ref,alt[/alt2]" and entries are joined
+// with ';'. Phasing index 0 is written as "1|1", index 1 as "2|2"; any other
+// index yields an empty phasing entry.
+void WholeGenome::ConstructMatchRecordNoGenotypeBackup(SequencePath & best_path,
+    vector<DiploidVariant> & variant_list,
+    string & subsequence,
+    int offset,
+    int thread_index,
+    int chr_id,
+    int mode_index,
+    int threshold_index){
+    int truth_num = 0;
+    int predict_num = 0;
+
+    // Textual records are only produced for the first threshold.
+    const bool need_match_record = (threshold_index == 0);
+
+    string vcf_record[2];
+    string phasing_record[2];
+
+    for (int i = 0; i < 2; i++) {
+        for (auto it = best_path.choice_made[i].begin(); it != best_path.choice_made[i].end(); ++it) {
+            const pair<int, int> & selection = it->second;
+            const int phasing = selection.second;
+            if (selection.first == -1) continue;
+            if (phasing == -1) continue;
+
+            // Reference instead of a copy: the variant is only read here.
+            const DiploidVariant & variant = variant_list[selection.first];
+            if (!variant.flag) {
+                truth_num++;
+            } else {
+                predict_num++;
+            }
+
+            if (!need_match_record) continue;
+
+            string alt_string = variant.alts[0];
+            if (variant.multi_alts) {
+                alt_string += "/" + variant.alts[1];
+            }
+
+            string phasing_string = "";
+            if (phasing == 0) {
+                phasing_string = "1|1";
+            } else if (phasing == 1) {
+                phasing_string = "2|2";
+            }
+
+            // A variant entry is never empty, so an empty vcf_record marks the
+            // first entry; both lists receive their separator in lock step.
+            if (!vcf_record[i].empty()) {
+                vcf_record[i] += ";";
+                phasing_record[i] += ";";
+            }
+            vcf_record[i] += to_string(variant.pos + 1) + "," + variant.ref + "," + alt_string;
+            phasing_record[i] += phasing_string;
+        }
+    }
+
+    if (need_match_record) {
+        string match_record = chrname_by_chrid[chr_id] + "\t" + to_string(offset + 1) + "\t" + subsequence + "\t" + best_path.donor_sequences[0];
+        match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+        match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+        match_record += "\t" + to_string(best_path.score) + "\n";
+
+        match_records_by_mode_by_thread[thread_index][mode_index]->push_back(match_record);
+    }
+
+    baseline_total_match_num[thread_index][threshold_index]->at(mode_index) += truth_num;
+    query_total_match_num[thread_index][threshold_index]->at(mode_index) += predict_num;
+}
+
+// Return true when paths a and b agree on donor-sequence lengths for both the
+// truth pair (donor_sequences[0], [1]) and the query pair ([2], [3]).
+// Within a pair the two lengths may match in order or with the two sequences
+// swapped.
+bool WholeGenome::DonorLengthEqual(SequencePath & a, SequencePath & b){
+    auto pair_lengths_agree = [&a, &b](int first, int second) -> bool {
+        const size_t a_first = a.donor_sequences[first].length();
+        const size_t a_second = a.donor_sequences[second].length();
+        const size_t b_first = b.donor_sequences[first].length();
+        const size_t b_second = b.donor_sequences[second].length();
+
+        const bool in_order = (a_first == b_first) && (a_second == b_second);
+        const bool swapped = (a_first == b_second) && (a_second == b_first);
+        return in_order || swapped;
+    };
+
+    return pair_lengths_agree(0, 1) && pair_lengths_agree(2, 3);
+}
+
+// Predicate reporting whether a path has been flagged as removable.
+bool IsRemovable(SequencePath & s){
+    return s.removable;
+}
+
+// Prune equivalent paths from path_list.
+//
+// Only paths with same_donor_len set and not already flagged removable take
+// part. For every pair of such paths for which DonorLengthEqual() holds, the
+// one with the lower score is flagged removable (on a tie the later path in
+// the list is flagged). All flagged paths are erased at the end.
+//
+// Paths are accessed through references: the previous version deep-copied
+// both SequencePath objects (vectors and maps included) for every comparison.
+void WholeGenome::ConvergePaths(list<SequencePath> & path_list){
+    if (path_list.size() <= 1) return;
+
+    for (list<SequencePath>::iterator i = path_list.begin(); i != path_list.end(); ++i) {
+        SequencePath & ref_path = *i;
+        if (ref_path.removable) continue;
+        if (!ref_path.same_donor_len) continue;
+
+        list<SequencePath>::iterator j = i;
+        for (++j; j != path_list.end(); ++j) {
+            SequencePath & que_path = *j;
+            if (que_path.removable) continue;
+            if (!que_path.same_donor_len) continue;
+            if (!DonorLengthEqual(ref_path, que_path)) continue;
+
+            if (ref_path.score >= que_path.score) {
+                que_path.removable = true;
+            } else {
+                // The outer path lost; nothing else needs comparing against it.
+                ref_path.removable = true;
+                break;
+            }
+        }
+    }
+
+    path_list.remove_if(IsRemovable);
+}
+
+// Scratch entry point for direct testing.
+// Installs a 9-base sequence as genome_sequences[0] and builds eight
+// DiploidVariant objects and a list holding them; the calls that would run
+// the matcher on that list are commented out, so the list is currently
+// unused. Always returns 0.
+// NOTE(review): the meaning of the DiploidVariant constructor arguments is
+// not visible from this file - confirm against diploidvariant.h.
+int WholeGenome::test() {
+ genome_sequences[0] = "GTCAGCCGG";
+ DiploidVariant d1(1, "T", vector<string> ({"A", "C"}), true, true, 0,0,0);
+ DiploidVariant d2(4, "G", vector<string> ({"C", ""}), true, false, 0,0,0);
+ DiploidVariant d3(5, "C", vector<string> ({"T", ""}), true, false, 0,0,0); // this is false negative
+ DiploidVariant d4(6, "C", vector<string> ({"G", ""}), true, false, 0,0,0);
+ DiploidVariant d5(7, "G", vector<string> ({"A", ""}), true, false, 0,0,0);
+ DiploidVariant d6(1, "T", vector<string> ({"A", "C"}), true, true, 0,0,1);
+ DiploidVariant d7(3, "AG", vector<string> ({"A", ""}), true, false, 1,0,1);
+ DiploidVariant d8(7, "G", vector<string> ({"GA", ""}), true, false, 0,1,1);
+
+ //complex_match_records = new vector<string>*[1];
+ //complex_match_records[0] = new vector<string>;
+ //vector<DiploidVariant> var_list = { d2,d3,d4,d5,d7,d8 };
+ vector<DiploidVariant> var_list = { d1,d2,d3,d4,d5,d6,d7,d8 };
+ //cout << MatchingSingleClusterBaseExtending(var_list, 0) << endl;
+ //cout << complex_match_records[0]->at(0) << endl;
+ return 0;
+}
+
+// private
+// Match all clusters in variants_by_cluster using thread_num threads and
+// write the results.
+//
+// Steps, in order:
+//  1. split the cluster index range into thread_num contiguous slices of
+//     cluster_step clusters (rounded up);
+//  2. allocate the per-thread result containers (match records per mode,
+//     match counters and edit-distance counters per threshold and mode);
+//  3. run ClusteringMatchInThread on thread_num - 1 worker threads plus the
+//     calling thread, then join the workers;
+//  4. write "<output_dir>/<output_prefix>.stat" and echo the statistics to
+//     stdout;
+//  5. write one "<output_dir>/<output_prefix>.<unit>_<mode>_<scheme>.match"
+//     file per requested combination;
+//  6. free every container allocated in step 2.
+void WholeGenome::ClusteringMatchMultiThread() {
+ int start = 0;
+ int cluster_number = variants_by_cluster.size(); // cluster number
+ int cluster_end_boundary = start + cluster_number; // end cluster id, exclusive
+ int cluster_step = cluster_number / thread_num; // assign clusters to threads
+ // round the slice size up so that thread_num slices cover every cluster
+ if (cluster_step * thread_num < cluster_number) cluster_step++;
+ int end = start + cluster_step;
+ //initialize vector size
+ //complex_match_records = new vector<string>*[thread_num];
+ // match_records_by_mode_by_thread[thread][mode] -> vector of record lines
+ match_records_by_mode_by_thread = new vector<string>**[thread_num];
+
+ //query_matches_by_mode_by_thread = new vector<int> ** [thread_num];
+
+ for(int i = 0; i < thread_num; i++){
+ match_records_by_mode_by_thread[i] = new vector<string>*[MATCH_MODE_NUM];
+ for(int j = 0; j < MATCH_MODE_NUM; j++){
+ match_records_by_mode_by_thread[i][j] = new vector<string>;
+ }
+ }
+
+ // counters are indexed [thread][threshold] and hold one int per mode
+ baseline_total_match_num = new vector<int>** [thread_num];
+ query_total_match_num = new vector<int> ** [thread_num];
+
+ baseline_total_edit_distance = new vector<int>** [thread_num];
+ query_total_edit_distance = new vector<int>** [thread_num];
+
+ for(int i = 0; i < thread_num; i++){
+
+ baseline_total_match_num[i] = new vector<int>* [ROC_SAMPLE_NUM];
+ query_total_match_num[i] = new vector<int>* [ROC_SAMPLE_NUM];
+
+ baseline_total_edit_distance[i] = new vector<int> * [ROC_SAMPLE_NUM];
+ query_total_edit_distance[i] = new vector<int>* [ROC_SAMPLE_NUM];
+
+ for(int j = 0; j < ROC_SAMPLE_NUM; j++){
+ baseline_total_match_num[i][j] = new vector<int>;
+ baseline_total_match_num[i][j]->resize(MATCH_MODE_NUM, 0);
+ query_total_match_num[i][j] = new vector<int>;
+ query_total_match_num[i][j]->resize(MATCH_MODE_NUM, 0);
+
+ baseline_total_edit_distance[i][j] = new vector<int>;
+ baseline_total_edit_distance[i][j]->resize(MATCH_MODE_NUM, 0);
+ query_total_edit_distance[i][j] = new vector<int>;
+ query_total_edit_distance[i][j]->resize(MATCH_MODE_NUM, 0);
+ }
+ }
+
+ vector<thread> threads;
+ //spawn threads
+ // worker k handles clusters [start, end) and reports into slot k
+ unsigned i = 0;
+ for (; i < thread_num - 1; i++) {
+ threads.push_back(thread(&WholeGenome::ClusteringMatchInThread, this, start, end, i));
+ start = end;
+ end = start + cluster_step;
+ }
+ // also you need to do a job in main thread
+ // i equals to (thread_num - 1)
+ if (i != thread_num - 1) {
+ dout << "[Error] thread number not match" << endl;
+ }
+ // the calling thread takes the last slice, unless it starts past the last cluster
+ if (start >= variants_by_cluster.size()) {
+ dout << "[Error] index out of map range" << endl;
+ }
+ else {
+ ClusteringMatchInThread(start, end, i);
+ }
+
+ // call join() on each thread in turn before this function?
+ std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
+
+ //output all results
+ cout << "writing results..." << endl;
+ ofstream output_stat_file;
+ output_stat_file.open(output_dir + "/" + output_prefix+".stat");
+
+ cout << "=========VarMatch Result Stat.=======" << endl;
+ // NOTE(review): the header names nine columns and lists "score_unit" twice
+ // (the third value written per row is score_scheme). Rows written to the
+ // .stat file below carry only the first seven fields; the two edit-distance
+ // fields are printed to stdout only. Confirm which layout consumers expect.
+ string stat_head_string = "#score_unit\tmatch_mode\tscore_unit\tqual_threshold\tbaseline_match_num\tquery_match_num\tquery_total_num\tbaseline_total_ED\tquery_total_ED";
+ cout << stat_head_string << endl;
+ output_stat_file << "##Baseline:" << baseline_variant_total_num << endl;
+ output_stat_file << "##Query:"<< query_variant_total_num << endl;
+ output_stat_file << stat_head_string << endl;
+
+ int score_unit;
+ int match_mode;
+ int score_scheme;
+
+ // one statistics row per requested (score_unit, match_mode, score_scheme)
+ for(int x = 0; x < score_unit_list.size(); x++){
+ score_unit = score_unit_list[x];
+ for(int y = 0; y < match_mode_list.size(); y++){
+ match_mode = match_mode_list[y];
+ for(int z = 0; z < score_scheme_list.size(); z++){
+ score_scheme = score_scheme_list[z];
+ int mode_index = GetIndexFromMatchScore(score_unit, match_mode, score_scheme);
+ int total_ref_complex = 0;
+ int total_que_complex = 0;
+
+ // each string collects one comma-separated value per threshold
+ string threshold_string = "";
+ string baseline_match_num_string = "";
+ string query_match_num_string = "";
+ string query_total_num_string = "";
+
+ string baseline_edit_distance_string = "";
+ string query_edit_distance_string = "";
+
+ for(int t = 0; t < threshold_num; t++){
+
+ threshold_string += to_string(threshold_list[t]);
+
+ int baseline_match_num_by_threshold_by_mode = 0;
+ int query_match_num_by_threshold_by_mode = 0;
+
+ int baseline_edit_distance_by_threshold_by_mode = 0;
+ int query_edit_distance_by_threshold_by_mode = 0;
+
+ // sum the per-thread counters for this threshold and mode
+ for(int i = 0; i < thread_num; i++){
+ baseline_match_num_by_threshold_by_mode += baseline_total_match_num[i][t]->at(mode_index);
+ query_match_num_by_threshold_by_mode += query_total_match_num[i][t]->at(mode_index);
+
+ baseline_edit_distance_by_threshold_by_mode += baseline_total_edit_distance[i][t]->at(mode_index);
+ query_edit_distance_by_threshold_by_mode += query_total_edit_distance[i][t]->at(mode_index);
+ }
+
+ baseline_match_num_string += to_string(baseline_match_num_by_threshold_by_mode);
+ query_match_num_string += to_string(query_match_num_by_threshold_by_mode);
+ query_total_num_string += to_string((int)(query_variant_total_num * (1-per_list[t])) );
+
+ baseline_edit_distance_string += to_string(baseline_edit_distance_by_threshold_by_mode);
+ query_edit_distance_string += to_string(query_edit_distance_by_threshold_by_mode);
+
+ // comma between thresholds, none after the last one
+ if(t < threshold_num-1){
+ threshold_string += ",";
+ baseline_match_num_string += ",";
+ query_match_num_string += ",";
+ query_total_num_string += ",";
+ baseline_edit_distance_string += ",";
+ query_edit_distance_string += ",";
+ }
+
+ }
+
+ string total_match_num_string = to_string(score_unit) + "\t" +
+ to_string(match_mode) + "\t" +
+ to_string(score_scheme) + "\t" +
+ threshold_string + "\t" +
+ baseline_match_num_string + "\t" +
+ query_match_num_string + "\t" +
+ query_total_num_string;// + "\t" + to_string(mode_index);
+ cout << total_match_num_string << "\t" << baseline_edit_distance_string << "\t" << query_edit_distance_string << endl;;
+ output_stat_file << total_match_num_string << endl;
+ }
+ }
+ }
+ output_stat_file.close();
+
+ // not used below
+ int bench_mode_index = GetIndexFromMatchScore(0, 0, 0);
+
+ // one .match file per requested (score_unit, match_mode, score_scheme)
+ for(int x = 0; x < score_unit_list.size(); x++){
+ score_unit = score_unit_list[x];
+ for(int y = 0; y < match_mode_list.size(); y++){
+ match_mode = match_mode_list[y];
+ for(int z = 0; z < score_scheme_list.size(); z++){
+ score_scheme = score_scheme_list[z];
+ int mode_index = GetIndexFromMatchScore(score_unit, match_mode, score_scheme);
+ string filename_index = to_string(score_unit) + "_" + to_string(match_mode) + "_" + to_string(score_scheme);
+
+ ofstream output_complex_file;
+ output_complex_file.open(output_dir + "/" + output_prefix+"."+filename_index+".match");
+
+ output_complex_file << "##VCF1:" << ref_vcf_filename << endl;
+ output_complex_file << "##VCF2:" << que_vcf_filename << endl;
+ output_complex_file << "#CHROM\tPOS\tREF\tALT\tVCF1\tVCF2\tPHASE1\tPHASE2\tSCORE" << endl;
+
+ // records are written thread by thread; records made only of spaces are skipped
+ for(int i = 0; i < thread_num; i++){
+ for(int k = 0; k < match_records_by_mode_by_thread[i][mode_index]->size(); k++){
+ if (match_records_by_mode_by_thread[i][mode_index]->at(k).find_first_not_of(' ') != std::string::npos) {
+ //if(match_records_by_mode_by_thread[i][mode_index]->at(k)[0] == '$'){
+ //int bench_mode_index = stoi(match_records_by_mode_by_thread[i][mode_index]->at(k).erase(0,1));
+ //output_complex_file << match_records_by_mode_by_thread[i][0]->at(k);
+ //}else{
+ output_complex_file << match_records_by_mode_by_thread[i][mode_index]->at(k);
+ //}
+ }
+ }
+ }
+ output_complex_file.close();
+ }
+ }
+ }
+
+ // clear all matching records
+ // release in reverse order of allocation: vectors, then per-thread arrays, then the top-level arrays
+ for(int i = 0; i < thread_num; i++){
+ for(int j = 0; j < MATCH_MODE_NUM; j++){
+ delete match_records_by_mode_by_thread[i][j];
+
+ }
+ for(int j = 0; j < ROC_SAMPLE_NUM; j++){
+ delete baseline_total_match_num[i][j];
+ delete query_total_match_num[i][j];
+
+ delete baseline_total_edit_distance[i][j];
+ delete query_total_edit_distance[i][j];
+ }
+ delete[] match_records_by_mode_by_thread[i];
+ delete[] baseline_total_match_num[i];
+ delete[] query_total_match_num[i];
+
+ delete[] baseline_total_edit_distance[i];
+ delete[] query_total_edit_distance[i];
+ }
+ delete[] match_records_by_mode_by_thread;
+ delete[] baseline_total_match_num;
+ delete[] query_total_match_num;
+
+ delete[] baseline_total_edit_distance;
+ delete[] query_total_edit_distance;
+
+}
+
+//[TODO] unit test
+// Normalize a reference allele and two alternate alleles in place and return
+// the (0-based) position of the normalized representation.
+//
+// Right side: while all three alleles end in the same base, that base is
+// dropped; whenever an allele becomes empty, the genome base to the left of
+// the current position is prepended to all three alleles and the position
+// moves one to the left.
+// Left side: while all three alleles start with the same base and each is
+// longer than one base, that base is dropped and the position moves one to
+// the right.
+//
+// Returns -1 when no sequence is loaded for chr_id; otherwise the resulting
+// position. Alleles that need no change return `pos` unchanged.
+int WholeGenome::NormalizeVariantSequence(int pos, string & parsimonious_ref, string & parsimonious_alt0, string & parsimonious_alt1, int chr_id) {
+
+    int left_index = pos;
+
+    // find() rather than operator[] so a missing chromosome is not inserted
+    // into the shared map as a side effect.
+    map<int, string>::const_iterator genome_it = genome_sequences.find(chr_id);
+    if (genome_it == genome_sequences.end() || genome_it->second.empty()) return -1;
+    const string & genome = genome_it->second;
+
+    // Three single-base alleles are already minimal. This used to
+    // `return true`, i.e. position 1, regardless of `pos`.
+    if (parsimonious_ref.size() == 1 && parsimonious_alt0.size() == 1 && parsimonious_alt1.size() == 1) return left_index;
+
+    bool change_in_allels = true;
+    while (change_in_allels) {
+        change_in_allels = false;
+
+        // back() on an empty string is undefined, so only compare suffixes
+        // when every allele has at least one base.
+        const bool all_non_empty = !parsimonious_ref.empty() && !parsimonious_alt0.empty() && !parsimonious_alt1.empty();
+        if (all_non_empty &&
+            parsimonious_ref.back() == parsimonious_alt0.back() &&
+            parsimonious_ref.back() == parsimonious_alt1.back()) {
+            // when left_index == 0 an emptied allele could not be re-padded,
+            // so trimming is only allowed if nothing becomes empty
+            if ((parsimonious_ref.size() > 1 && parsimonious_alt0.size() > 1 && parsimonious_alt1.size() > 1) || left_index > 0) {
+                parsimonious_ref.pop_back();
+                parsimonious_alt0.pop_back();
+                parsimonious_alt1.pop_back();
+                change_in_allels = true;
+            }
+        }
+
+        if (parsimonious_ref.length() == 0 || parsimonious_alt0.length() == 0 || parsimonious_alt1.length() == 0) {
+            // No base exists to the left of position 0; stop instead of
+            // indexing the genome at -1.
+            if (left_index <= 0) break;
+            left_index--;
+            char left_char = toupper(genome[left_index]);
+            parsimonious_ref = left_char + parsimonious_ref;
+            parsimonious_alt0 = left_char + parsimonious_alt0;
+            parsimonious_alt1 = left_char + parsimonious_alt1;
+        }
+    }
+
+    while (parsimonious_ref[0] == parsimonious_alt0[0] &&
+           parsimonious_ref[0] == parsimonious_alt1[0] &&
+           parsimonious_ref.size() > 1 &&
+           parsimonious_alt0.size() > 1 &&
+           parsimonious_alt1.size() > 1)
+    {
+        parsimonious_ref.erase(0, 1);
+        parsimonious_alt0.erase(0, 1);
+        parsimonious_alt1.erase(0, 1);
+        left_index++; // the variant now starts one base further right
+    }
+    return left_index;
+}
+
+// Group the baseline and query variants of one chromosome into clusters.
+//
+// Both variant lists are sorted, then merged by position (on equal positions
+// the query variant is taken first). A new cluster is started before the next
+// variant when the gap between the end of the current cluster and that
+// variant is at least 2 bases and either
+//   - no insertion/deletion length has been accumulated in the cluster, or
+//   - the gap is longer than twice the accumulated change and is either
+//     longer than MAX_REPEAT_LEN or not reported as a tandem repeat by
+//     CheckTandemRepeat().
+// Finished clusters are appended to variant_cluster_by_chrid[chr_id].
+void WholeGenome::SingleThreadClustering(int chr_id) {
+    vector<DiploidVariant> & ref_variants = *ref_variant_by_chrid[chr_id];
+    vector<DiploidVariant> & que_variants = *que_variant_by_chrid[chr_id];
+    sort(ref_variants.begin(), ref_variants.end());
+    sort(que_variants.begin(), que_variants.end());
+    const int ref_size = ref_variants.size();
+    const int que_size = que_variants.size();
+
+    // accumulated insertion / deletion lengths, indexed by variant flag (0 or 1)
+    int ins_len[2] = { 0, 0 };
+    int del_len[2] = { 0, 0 };
+    int c_start = 0; // first position after the variants seen in the current cluster
+    int ref_index = 0;
+    int que_index = 0;
+    bool seen_variant = false;
+    vector<VariantIndicator> vi_list;
+
+    while (ref_index < ref_size || que_index < que_size) {
+        // merge step: the baseline variant is taken only when it lies strictly
+        // before the next query variant, or when no query variant is left
+        bool take_ref;
+        if (ref_index >= ref_size) {
+            take_ref = false;
+        } else if (que_index >= que_size) {
+            take_ref = true;
+        } else {
+            take_ref = ref_variants.at(ref_index).pos < que_variants.at(que_index).pos;
+        }
+
+        const int var_index = take_ref ? ref_index++ : que_index++;
+        const DiploidVariant & snp = take_ref ? ref_variants.at(var_index)
+                                              : que_variants.at(var_index);
+
+        // decide whether the current cluster ends before this variant
+        if (seen_variant) {
+            const int separator_length = snp.pos - c_start;
+            if (separator_length >= 2) {
+                const string separator = genome_sequences[chr_id].substr(c_start, separator_length);
+                const int max_change = max(ins_len[0] + del_len[1], ins_len[1] + del_len[0]);
+                const bool separate_cluster =
+                    max_change == 0 ||
+                    (separator_length > 2 * max_change &&
+                     (separator_length > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, max_change)));
+
+                if (separate_cluster) {
+                    variant_cluster_by_chrid[chr_id]->push_back(vi_list);
+                    vi_list.clear();
+                    ins_len[0] = ins_len[1] = 0;
+                    del_len[0] = del_len[1] = 0;
+                    c_start = 0; // recomputed from this variant just below
+                }
+            }
+        }
+
+        c_start = max(c_start, snp.pos + (int)snp.ref.length());
+        vi_list.push_back(VariantIndicator(chr_id, var_index, take_ref));
+        seen_variant = true;
+
+        const int side = snp.flag ? 1 : 0;
+        ins_len[side] += snp.mil;
+        del_len[side] += snp.mdl;
+    }
+
+    // flush the last, still open cluster
+    if (!vi_list.empty()) {
+        variant_cluster_by_chrid[chr_id]->push_back(vi_list);
+    }
+}
+
+// Read the baseline VCF through ReadWholeGenomeVariant with the flag set to
+// false and hand back whatever that call returns.
+int WholeGenome::ReadReferenceVariants(string filename){
+    const bool variant_flag = false;
+    return ReadWholeGenomeVariant(filename, variant_flag);
+}
+
+// Read the query VCF through ReadWholeGenomeVariant with the flag set to
+// true and hand back whatever that call returns.
+int WholeGenome::ReadQueryVariants(string filename){
+    const bool variant_flag = true;
+    return ReadWholeGenomeVariant(filename, variant_flag);
+}
+
+// Load the genome sequence file, then the baseline VCF, and remember the
+// baseline file name and the value returned by ReadReferenceVariants.
+// The return value of ReadWholeGenomeSequence is not checked.
+void WholeGenome::ReadRef(string genome_seq, string ref_vcf){
+    ReadWholeGenomeSequence(genome_seq);
+
+    const int baseline_count = ReadReferenceVariants(ref_vcf);
+    baseline_variant_total_num = baseline_count;
+    ref_vcf_filename = ref_vcf;
+}
+
+// Compare one query VCF against the baseline that is already held by this
+// object and write the result files.
+//
+// query_vcf       path of the query VCF
+// output_prefix   prefix of the output file names
+// detail_results  stored in the member of the same name
+// score_unit_, match_mode_, score_scheme_
+//                 requested setting for each dimension; -1 expands to every
+//                 value of that dimension (units 0,1; modes 0,1; schemes 0,1,2).
+//                 score_scheme_ == 3 hands the whole comparison to DirectMatch()
+//                 and returns immediately.
+//
+// All per-query state filled here is cleared again before returning, so the
+// object can be reused for another query.
+void WholeGenome::Compare(string query_vcf,
+    string output_prefix,
+    bool detail_results,
+    int score_unit_,
+    int match_mode_,
+    int score_scheme_)
+{
+    que_vcf_filename = query_vcf;
+
+    this->output_prefix = output_prefix;
+    this->detail_results = detail_results;
+
+    score_unit_indicator = score_unit_;
+    match_mode_indicator = match_mode_;
+    score_scheme_indicator = score_scheme_;
+
+    if(score_scheme_indicator == 3){
+        DirectMatch(ref_vcf_filename, query_vcf, match_mode_, output_prefix);
+        return;
+    }
+
+    query_variant_total_num = ReadQueryVariants(query_vcf);
+
+    if(score_unit_indicator == -1){
+        score_unit_list.push_back(0);
+        score_unit_list.push_back(1);
+    }else{
+        score_unit_list.push_back(score_unit_indicator);
+    }
+
+    if(match_mode_indicator == -1){
+        match_mode_list.push_back(0);
+        match_mode_list.push_back(1);
+    }else{
+        match_mode_list.push_back(match_mode_indicator);
+    }
+
+    if(score_scheme_indicator == -1){
+        score_scheme_list.push_back(0);
+        score_scheme_list.push_back(1);
+        score_scheme_list.push_back(2);
+    }else{
+        score_scheme_list.push_back(score_scheme_indicator);
+    }
+
+    // Pre-compute the mode index of every requested combination.
+    // The first argument is the score unit: it used to be read from
+    // score_scheme_list[i], which yields wrong indices and reads past the end
+    // of score_scheme_list when there are more units than schemes.
+    for(size_t i = 0; i < score_unit_list.size(); i++){
+        for(size_t j = 0; j < match_mode_list.size(); j++){
+            for(size_t k = 0; k < score_scheme_list.size(); k++){
+                int mode_index = GetIndexFromMatchScore(score_unit_list[i], match_mode_list[j], score_scheme_list[k]);
+                mode_index_list.push_back(mode_index);
+            }
+        }
+    }
+
+    cout << "Baseline VCF: " << ref_vcf_filename << endl;
+    cout << "Query VCF: " << query_vcf << endl;
+    cout << "========VCF Stat.==========" << endl;
+    cout << "Total Number of VCF Entries: " << endl;
+    cout << "Baseline: " << baseline_variant_total_num << "; Query: " << query_variant_total_num << endl;
+
+    cout << "parallel clustering..." << endl;
+    ParallelClustering();
+
+    cout << "matching variants..." << endl;
+    ClusteringMatchMultiThread();
+
+    // most clustering results are cleared inside ParallelClustering function except the following one
+    // which is needed for matching
+    variants_by_cluster.clear();
+
+    // drop the query variants but keep the per-chromosome vectors allocated
+    for(int j = 0; j < chrom_num; j++){
+        que_variant_by_chrid[j]->clear();
+    }
+
+    query_variant_strings.clear();
+    query_variant_total_num = 0;
+    threshold_list.clear();
+    threshold_num = 0;
+    // The following matching results are cleared inside ClusteringMatchMultiThread function
+    // match_records_by_mode_by_thread;
+    // baseline_total_match_num;
+    // query_total_match_num;
+
+    score_unit_list.clear();
+    match_mode_list.clear();
+    score_scheme_list.clear();
+    mode_index_list.clear();
+
+    return;
+}
+
+// Match query variants against baseline variants by direct comparison at the
+// same position, without clustering.
+//
+// ref_vcf        not read here; the baseline variants already held in
+//                ref_variant_by_chrid are used
+// query_vcf      query VCF, loaded through ReadQueryVariants()
+// match_mode_    1 compares with DiploidVariant::CompareNoGenotype(), any
+//                other value with operator==
+// output_prefix  matches found with operator== are written to
+//                "<output_dir>/<output_prefix>.direct" as "chrom<TAB>pos<TAB>ref<TAB>"
+//
+// Each query variant is counted at most once. The total is reported on dout.
+// NOTE(review): matches found in mode 1 are counted but not written to the
+// file, and the position is written without the +1 used by the .match
+// records elsewhere in this file - confirm both are intended.
+void WholeGenome::DirectMatch(string ref_vcf, string query_vcf, int match_mode_, string output_prefix)
+{
+    match_mode_indicator = match_mode_;
+    // called for its side effect of loading the query variants; the returned count is not needed
+    ReadQueryVariants(query_vcf);
+
+    int match_num = 0;
+    ofstream output_stat_file;
+    output_stat_file.open(output_dir + "/" + output_prefix+".direct");
+
+    for(int i = 0; i < chrom_num; i++){
+        vector<DiploidVariant> & ref_variants = *ref_variant_by_chrid[i];
+        vector<DiploidVariant> & que_variants = *que_variant_by_chrid[i];
+        if(ref_variants.empty() || que_variants.empty())
+            continue;
+
+        // position -> index into ref_variants; several variants may share a position
+        multimap<int, int> ref_variant_by_pos;
+        for(size_t j = 0; j < ref_variants.size(); j++){
+            ref_variant_by_pos.insert(pair<int, int>(ref_variants[j].pos, (int)j));
+        }
+
+        for(size_t j = 0; j < que_variants.size(); j++){
+            DiploidVariant & var = que_variants[j];
+
+            // equal_range yields an empty range when the position is absent,
+            // so no separate find() is needed
+            pair<multimap<int, int>::iterator, multimap<int, int>::iterator> var_range =
+                ref_variant_by_pos.equal_range(var.pos);
+
+            for(multimap<int, int>::iterator it = var_range.first; it != var_range.second; ++it){
+                DiploidVariant & ref_var = ref_variants[it->second];
+                if (match_mode_indicator != 1 && var == ref_var){
+                    match_num ++;
+                    string matched_variant = chrname_by_chrid[i] + "\t" + to_string(ref_var.pos) + "\t" + ref_var.ref + "\t";
+                    output_stat_file << matched_variant << endl;
+                    break;
+                }else if(match_mode_indicator == 1 && var.CompareNoGenotype(ref_var)){
+                    match_num ++;
+                    break;
+                }
+            }
+        }
+    }
+    output_stat_file.close();
+    dout << "matched variants: " << match_num << endl;
+}
diff --git a/src/wholegenome.h b/src/wholegenome.h
new file mode 100644
index 0000000..2817ee9
--- /dev/null
+++ b/src/wholegenome.h
@@ -0,0 +1,367 @@
+#pragma once
+
+#include <unordered_map>
+#include <algorithm>
+#include <vector>
+#include <map>
+#include <list>
+#include <tuple>
+#include <chrono>
+#include <cstdint>
+#include <limits>
+#include <thread>
+#include <set>
+
+#include "util.h"
+#include "diploidvariant.h"
+//#include "tbb/task_scheduler_init.h"
+//#include "tbb/blocked_range.h"
+//#include "tbb/parallel_for.h"
+//#include "tbb/concurrent_vector.h"
+
+// Lightweight handle to one variant: the chromosome id, the index of the
+// variant inside that chromosome's variant vector, and whether the variant
+// comes from the reference (baseline) list rather than the query list.
+// NOTE(review): chr_id is stored in a `char` although the constructor takes
+// an int, so the value is narrowed on construction.
+struct VariantIndicator {
+    char chr_id;
+    int var_id;
+    bool refer;
+
+    VariantIndicator(int chr_id_ = -1, int var_id_ = -1, bool refer_ = true)
+        : chr_id(chr_id_), var_id(var_id_), refer(refer_) {}
+};
+
+// Integer interval described by its start and end; both default to 0.
+struct Interval {
+    int start;
+    int end;
+
+    Interval() : start(0), end(0) {}
+    Interval(int s, int e) : start(s), end(e) {}
+};
+
+// State of one candidate path explored by the matcher.
+class SequencePath{
+public:
+    // n: reference length; every one of the four string_sequences gets n
+    //    entries initialised to ".".
+    // v: number of entries of choice_vector, each initialised to -89.
+    SequencePath(int n, int v)
+        : reference_length(n),
+          current_genome_pos(-1),
+          score(0),
+          removable(false),
+          same_donor_len(false),
+          reached_sync_num(0)
+    {
+        for (auto & per_position : string_sequences) {
+            per_position.resize(n, "."); // default value is "."
+        }
+        for (auto & donor : donor_sequences) {
+            donor.clear();
+        }
+        current_equal_donor_pos[0] = -1;
+        current_equal_donor_pos[1] = -1;
+
+        // a non-positive v leaves choice_vector empty
+        if (v > 0) {
+            choice_vector.resize(v, -89);
+        }
+    }
+
+    int reference_length;
+    vector<string> string_sequences[4];
+    // per side: records whether a choice was made and which one;
+    // one choice is a pair: variant id, phasing index
+    map<int, pair<int, int>> choice_made[2];
+    int current_genome_pos;
+    string donor_sequences[4];
+    int current_equal_donor_pos[2];
+    int score;
+    bool removable;        // set when the path is to be dropped
+    bool same_donor_len;
+    int reached_sync_num;
+    vector<int> choice_vector;
+};
+
+// Whole-genome comparison of a query VCF against a baseline VCF.
+// Public entry points: ReadRef() loads the genome and the baseline variants,
+// Compare() runs one query against them, DirectMatch() compares variants
+// position by position.
+// NOTE(review): the constructor declared here takes three arguments while
+// wholegenome_backup.cpp defines a five-argument one - confirm that the
+// backup file is not part of the build.
+class WholeGenome{
+private:
+ int chrom_num;
+ int thread_num;
+ string ref_vcf_filename;
+ string que_vcf_filename;
+ // values returned by ReadReferenceVariants() / ReadQueryVariants()
+ int baseline_variant_total_num;
+ int query_variant_total_num;
+ vector<string> baseline_variant_strings;
+ vector<string> query_variant_strings;
+ bool detail_results;
+
+ //int thread_num; VCF->DiploidVariant->WholeGenome
+protected:
+ map<string, int> chrid_by_chrname;
+ map<int, string> chrname_by_chrid;
+ map<string, int> chrname_dict;
+ // genome sequence per chromosome id
+ map<int, string> genome_sequences;
+ // per chromosome id: pointer to the baseline / query variant vector
+ vector<DiploidVariant> ** ref_variant_by_chrid;
+ vector<DiploidVariant> ** que_variant_by_chrid;
+ vector<vector<VariantIndicator>> ** variant_cluster_by_chrid;
+ // so here cluster is represented as vector<vector<VariantIndicator>>
+ // and we create a list of pointers point to cluster
+ // and we hold the point to that list
+
+ vector<vector<VariantIndicator>> variants_by_cluster;
+
+ // indexed [thread][mode]; allocated and freed in ClusteringMatchMultiThread
+ vector<string> *** match_records_by_mode_by_thread;
+
+ //vector<int> *** baseline_matches_by_mode_by_thread;
+ //vector<int> *** query_matches_by_mode_by_thread;
+ // indexed [thread][threshold], one int per mode; allocated and freed in
+ // ClusteringMatchMultiThread
+ vector<int> *** baseline_total_match_num;
+ vector<int> *** query_total_match_num;
+
+ vector<int> *** baseline_total_edit_distance;
+ vector<int> *** query_total_edit_distance;
+
+ //map<float, int> *** tp_qual_num_by_mode_by_thread;
+ //map<float, int> *** fp_qual_num_by_mode_by_thread;
+
+ //map<float, int> query_total_qual_num;
+
+ string output_prefix;
+ string output_dir;
+ // copy the above into this.
+
+ // settings requested in Compare(); -1 expands to every value of the dimension
+ int score_unit_indicator;
+ int match_mode_indicator;
+ int score_scheme_indicator;
+
+ // concrete values derived from the indicators above
+ vector<int> score_unit_list;
+ vector<int> match_mode_list;
+ vector<int> score_scheme_list;
+ vector<int> mode_index_list;
+
+ vector<double> threshold_list;
+ int threshold_num;
+
+ vector<float> per_list;
+
+ bool ReadWholeGenomeSequence(string filename);
+ bool ReadGenomeSequenceList(string filename);
+ int ReadWholeGenomeVariant(string filename, bool flag);
+ bool ReadVariantFileList(string filename);
+ // wrappers around ReadWholeGenomeVariant with flag false / true
+ int ReadReferenceVariants(string filename);
+ int ReadQueryVariants(string filename);
+ bool ParallelClustering(); // parallel by chr id
+ bool ParallelMatching(); // parallel by task
+ bool TBBMatching();
+
+ void SingleThreadClustering(int chr_id);
+ //bool MatchingSingleCluster(int cluster_index, int thread_index, int match_mode);
+
+ //override
+ bool ClusteringMatchInThread(int start, int end, int thread_index);
+ void ClusteringMatchMultiThread();
+ // normalizes the three alleles in place and returns the resulting position
+ int NormalizeVariantSequence(int pos,
+ string & parsimonious_ref,
+ string & parsimonious_alt0,
+ string & parsimonious_alt1,
+ int chr_id);
+
+ // orders intervals by their start
+ struct compInterval {
+ bool operator()(const Interval &a, const Interval &b) const {
+ return a.start<b.start;
+ }
+ };
+
+ // Sorts `intervals` in place by start and returns them with overlapping
+ // intervals merged; intervals that merely touch (end == next start) are
+ // merged as well.
+ vector<Interval> merge(vector<Interval> &intervals) {
+ sort(intervals.begin(),intervals.end(),compInterval());
+ vector<Interval> results;
+ for(int i=0; i<intervals.size(); i++) {
+ if(results.empty() || results.back().end < intervals[i].start) // no overlap
+ results.push_back(intervals[i]);
+ else // overlap
+ results.back().end = max(results.back().end, intervals[i].end);
+ }
+ return results;
+ }
+
+ int PathNeedDecision(SequencePath& sp, multimap<int, int> * choices_by_pos[], int pos);
+ int PathExtendOneStep(SequencePath& sp,
+ multimap<int, int> * choices_by_pos[],
+ const string & reference_sequence,
+ vector<int> & sync_points,
+ int match_mode,
+ int & variant_need_decision);
+
+ bool PathMakeDecision(SequencePath& sp,
+ vector<DiploidVariant> & variant_list,
+ multimap<int, int> * choices_by_pos[],
+ list<SequencePath> & sequence_path_list,
+ const string & reference_sequence,
+ int score_unit,
+ int match_mode,
+ int score_scheme);
+
+ bool VariantMakeDecision(SequencePath& sp,
+ vector<DiploidVariant> & variant_list,
+ list<SequencePath> & sequence_path_list,
+ const string & reference_sequence,
+ int score_unit,
+ int match_mode,
+ int score_scheme,
+ int variant_index);
+
+ bool VariantMakeDecisionNoGenotype(SequencePath& sp,
+ vector<DiploidVariant> & variant_list,
+ list<SequencePath> & sequence_path_list,
+ const string & reference_sequence,
+ int score_unit,
+ int match_mode,
+ int score_scheme,
+ int variant_index);
+
+ bool AppendChangedSp(SequencePath& sp,
+ vector<DiploidVariant> & variant_list,
+ list<SequencePath> & sequence_path_list,
+ const string & reference_sequence,
+ int score_unit,
+ int match_mode,
+ int score_scheme,
+ int variant_index,
+ int c);
+
+ bool AppendChangedSpNoGenotype(SequencePath& sp,
+ vector<DiploidVariant> & variant_list,
+ list<SequencePath> & sequence_path_list,
+ const string & reference_sequence,
+ int score_unit,
+ int match_mode,
+ int score_scheme,
+ int variant_index,
+ int c);
+
+ bool PathMakeDecisionBackup(SequencePath& sp,
+ vector<DiploidVariant> & variant_list,
+ multimap<int, int> * choices_by_pos[],
+ list<SequencePath> & sequence_path_list,
+ const string & reference_sequence,
+ int score_unit,
+ int match_mode,
+ int score_scheme);
+
+ bool MatchingSingleClusterBaseExtending(int cluster_index,
+ int thread_index,
+ vector<DiploidVariant> & variant_list,
+ string & subsequence,
+ int offset,
+ multimap<int, int> * choices_by_pos[],
+ vector<int> & sync_points,
+ int chr_id,
+ int score_unit,
+ int match_mode,
+ int score_scheme,
+ int threshold_index);
+
+ // true when both paths agree on the donor-sequence lengths
+ bool DonorLengthEqual(SequencePath & a, SequencePath & b);
+ // removes paths that lose against an equivalent path with a better score
+ void ConvergePaths(list<SequencePath> & path_list);
+ int CheckPathEqualProperty(SequencePath & sp, int match_mode);
+
+ int ScoreEditDistance(DiploidVariant & dv, int allele_indicator);
+ int EditDistance(const std::string& s1, const std::string& s2);
+ bool PathMakeDecisionNoGenotype(SequencePath& sp,
+ vector<DiploidVariant> & variant_list,
+ multimap<int, int> * choices_by_pos[],
+ list<SequencePath> & sequence_path_list,
+ const string & reference_sequence,
+ int score_unit,
+ int match_mode,
+ int score_scheme);
+
+ // The four ConstructMatchRecord* members share one parameter list.
+ void ConstructMatchRecord(SequencePath & best_path,
+ vector<DiploidVariant> & variant_list,
+ string & subsequence,
+ int offset,
+ int thread_index,
+ int chr_id,
+ int mode_index,
+ int threshold_index);
+
+ void ConstructMatchRecordBackup(SequencePath & best_path,
+ vector<DiploidVariant> & variant_list,
+ string & subsequence,
+ int offset,
+ int thread_index,
+ int chr_id,
+ int mode_index,
+ int threshold_index);
+
+ void ConstructMatchRecordNoGenotype(SequencePath & best_path,
+ vector<DiploidVariant> & variant_list,
+ string & subsequence,
+ int offset,
+ int thread_index,
+ int chr_id,
+ int mode_index,
+ int threshold_index);
+
+ void ConstructMatchRecordNoGenotypeBackup(SequencePath & best_path,
+ vector<DiploidVariant> & variant_list,
+ string & subsequence,
+ int offset,
+ int thread_index,
+ int chr_id,
+ int mode_index,
+ int threshold_index);
+
+ int CalculateScore(DiploidVariant & dv,
+ int choice,
+ int score_unit,
+ int match_mode,
+ int score_scheme);
+
+ // maps a (score_unit, match_mode, score_scheme) combination to the mode
+ // index used to address the per-mode result containers
+ int GetIndexFromMatchScore(int score_unit, int match_mode, int score_scheme);
+ bool ClearQuery();
+
+ // upper-cases s in place
+ inline void ToUpper(string & s){
+ transform(s.begin(), s.end(), s.begin(), ::toupper);
+ }
+
+ bool CheckTandemRepeat(string sequence, int unit_threshold);
+
+ bool MatchVariantListInThread(int thread_index,
+ int threshold_index,
+ int chr_id,
+ vector<DiploidVariant> & variant_list,
+ int cluster_id);
+
+ void initialize_score_matrix(int **score, char **trackBack, int M, int N);
+ int needleman_wunsch(string S1, string S2, string &R1, string &R2);
+ void GenerateAltVector(string ref, string alt, vector<string> & alt_vector);
+
+ int CalculateEditDistance(DiploidVariant & dv,
+ int choice,
+ int match_mode);
+
+public:
+ WholeGenome(int thread_num_,
+ string output_dir_,
+ bool pr_curves_);
+
+ ~WholeGenome();
+
+ // loads the genome sequence and the baseline VCF
+ void ReadRef(string genome_seq,
+ string ref_vcf);
+
+ // compares one query VCF against the loaded baseline and writes the results
+ void Compare(string query_vcf,
+ string output_prefix,
+ bool detail_results,
+ int score_unit_,
+ int match_mode_,
+ int score_scheme_);
+
+ // position-by-position comparison without clustering
+ void DirectMatch(string ref_vcf,
+ string query_vcf,
+ int match_mode_,
+ string output_prefix);
+
+ int test(); // for direct test
+ void PrintPath(SequencePath & sp);
+
+ // size of the per-mode dimension of the result containers
+ const static int MATCH_MODE_NUM = 16;
+ const static int VAR_LEN = 100;
+ // gaps longer than this are not checked for tandem repeats when clustering
+ const static int MAX_REPEAT_LEN = 1000;
+ // size of the per-threshold dimension of the counter containers
+ const static int ROC_SAMPLE_NUM = 5;
+ const static int MEANING_CHOICE_BOUND = -10;
+ const static int NOT_USE = -9;
+ const static int EASY_MATCH_VAR_NUM = 5;
+};
diff --git a/src/wholegenome_backup.cpp b/src/wholegenome_backup.cpp
new file mode 100644
index 0000000..fbfe718
--- /dev/null
+++ b/src/wholegenome_backup.cpp
@@ -0,0 +1,2056 @@
+#include "wholegenome.h"
+
+using namespace std;
+
+/**
+ * Stores the matching parameters, expands them into the lists of settings
+ * to evaluate, and builds the human chromosome name table.
+ *
+ * An indicator of -1 expands to every value pushed below for that parameter
+ * (score unit 0 and 1, match mode 0 and 1, score scheme 0, 1 and 2); any
+ * other value is used as the single setting.
+ *
+ * @param thread_num_   number of worker threads
+ * @param score_unit_   score unit indicator, or -1 for all
+ * @param match_mode_   match mode indicator, or -1 for all
+ * @param score_scheme_ score scheme indicator, or -1 for all
+ * @param output_dir_   stored in output_dir
+ */
+WholeGenome::WholeGenome(int thread_num_,
+                         int score_unit_,
+                         int match_mode_,
+                         int score_scheme_,
+                         string output_dir_){
+
+    thread_num = thread_num_;
+    chrom_num = 24;
+
+    score_unit_indicator = score_unit_;
+    match_mode_indicator = match_mode_;
+    score_scheme_indicator = score_scheme_;
+
+    output_dir = output_dir_;
+
+    if(score_unit_indicator == -1){
+        score_unit_list.push_back(0);
+        score_unit_list.push_back(1);
+    }else{
+        score_unit_list.push_back(score_unit_indicator);
+    }
+
+    if(match_mode_indicator == -1){
+        match_mode_list.push_back(0);
+        match_mode_list.push_back(1);
+    }else{
+        match_mode_list.push_back(match_mode_indicator);
+    }
+
+    if(score_scheme_indicator == -1){
+        score_scheme_list.push_back(0);
+        score_scheme_list.push_back(1);
+        score_scheme_list.push_back(2);
+    }else{
+        score_scheme_list.push_back(score_scheme_indicator);
+    }
+
+    // Pre-compute the packed index of every (unit, mode, scheme) combination
+    // so later code can iterate over the modes without recomputing them.
+    for(int i = 0; i < score_unit_list.size(); i++){
+        for(int j = 0; j < match_mode_list.size(); j++){
+            for(int k = 0; k < score_scheme_list.size(); k++){
+                // The first argument is the score unit: index i walks
+                // score_unit_list, not score_scheme_list.
+                int mode_index = GetIndexFromMatchScore(score_unit_list[i], match_mode_list[j], score_scheme_list[k]);
+                mode_index_list.push_back(mode_index);
+            }
+        }
+    }
+
+    ref_variant_by_chrid = new vector<DiploidVariant>*[chrom_num];
+
+    for (int j = 0; j < chrom_num; j++) {
+        ref_variant_by_chrid[j] = new vector<DiploidVariant>;
+    }
+
+    // chr_id starts from 0: "1".."22" and "chr1".."chr22" map to 0..21,
+    // X to 22 and Y to 23.
+    for(int j = 1; j <= 22; j++){
+        string chr_name = to_string(j);
+        chrname_dict[chr_name] = j-1;
+        chr_name = "chr"+chr_name;
+        chrname_dict[chr_name] = j-1;
+    }
+    chrname_dict["X"] = 22;
+    chrname_dict["chrX"] = 22;
+    chrname_dict["Y"] = 23;
+    chrname_dict["chrY"] = 23;
+
+}
+
+// Packs the three settings into one 4-bit index:
+// bit 3 = score_unit & 1, bit 2 = match_mode & 1, bits 1-0 = score_scheme & 3.
+inline int WholeGenome::GetIndexFromMatchScore(int score_unit, int match_mode, int score_scheme){
+    const int unit_bit = (score_unit & 1) << 3;
+    const int mode_bit = (match_mode & 1) << 2;
+    const int scheme_bits = score_scheme & 3;
+    return unit_bit | mode_bit | scheme_bits;
+}
+
+// Releases the first chrom_num per-chromosome reference variant vectors and
+// then the array that holds their pointers.
+WholeGenome::~WholeGenome(){
+    int chr = 0;
+    while(chr < chrom_num){
+        vector<DiploidVariant> * variants = ref_variant_by_chrid[chr];
+        variants->clear();
+        delete variants;
+        ++chr;
+    }
+    delete[] ref_variant_by_chrid;
+}
+
+/**
+ * Reads a FASTA file and stores each sequence under its chromosome id.
+ *
+ * The record name is the first space-separated token of the '>' line,
+ * without the '>'. It must be a key of chrname_dict. An empty line ends the
+ * current record. A record containing a sequence line with a space is
+ * discarded. On success chrom_num is set to the number of records stored.
+ *
+ * @param filename path of the FASTA file
+ * @return false if the file cannot be opened or a record name is unknown,
+ *         true otherwise
+ */
+bool WholeGenome::ReadWholeGenomeSequence(string filename){
+    std::ifstream input(filename);
+    if(!input.good()){
+        std::cerr << "Error opening '"<<filename<<"'. Bailing out." << std::endl;
+        return false;
+    }
+
+    std::string line, name, content;
+    int real_chrom_num = 0;
+
+    // Stores the record accumulated in (name, content).
+    // Returns false when the name is not a known chromosome.
+    auto store_record = [&]() -> bool {
+        if(chrname_dict.find(name) == chrname_dict.end()){
+            cout << "[VarMatch] Error: detected chromosome name: " << name <<" does not exist in human genome." << endl;
+            return false;
+        }
+        int chr_id = chrname_dict[name];
+        chrid_by_chrname[name] = chr_id;
+        chrname_by_chrid[chr_id] = name;
+        genome_sequences[chr_id] = content;
+        real_chrom_num++;
+        return true;
+    };
+
+    // Test the stream itself rather than .good(): when the last line has no
+    // trailing newline getline() sets eofbit but still delivers the line,
+    // and .good() would make the loop drop it.
+    while( std::getline( input, line ) ){
+        if( line.empty() || line[0] == '>' ){ // Identifier marker
+            if( !name.empty() ){ // flush the previous record
+                if(!store_record()) return false;
+                name.clear();
+            }
+            if( !line.empty() ){
+                name = split(line, ' ')[0].substr(1);
+            }
+            content.clear();
+        } else if( !name.empty() ){
+            if( line.find(' ') != std::string::npos ){ // Invalid sequence--no spaces allowed
+                name.clear();
+                content.clear();
+            } else {
+                content += line;
+            }
+        }
+    }
+    if( !name.empty() ){ // flush the last record
+        if(!store_record()) return false;
+    }
+
+    chrom_num = real_chrom_num;
+    return true;
+}
+
+// Not implemented: reads nothing and reports failure.
+// The explicit return is required because flowing off the end of a
+// non-void function is undefined behaviour.
+bool WholeGenome::ReadGenomeSequenceList(string filename){
+    (void)filename;
+    return false;
+}
+
+/**
+ * Reads the variants of a VCF file into the per-chromosome variant lists.
+ *
+ * Skipped lines: header lines ('#'), lines of length <= 1, homozygous
+ * reference genotypes (0/0), genotypes that do not split into two alleles
+ * on '/' or '|', variants whose insertion or deletion length exceeds
+ * VAR_LEN, and variants on a chromosome name that is neither in the loaded
+ * genome nor in chrname_dict.
+ *
+ * @param filename path of the VCF file
+ * @param flag     false: store in ref_variant_by_chrid (baseline);
+ *                 true: store in que_variant_by_chrid, keep the raw line in
+ *                 query_variant_strings and count its integer quality in
+ *                 quality_que_totalnum
+ * @return number of variants stored, or -1 if the file cannot be opened
+ */
+int WholeGenome::ReadWholeGenomeVariant(string filename, bool flag){
+    int total_num = 0;
+    ifstream vcf_file;
+    vcf_file.open(filename.c_str());
+    if (!vcf_file.good()) {
+        cout << "[VarMatch] Error: can not open vcf file" << endl;
+        return -1;
+    }
+
+    int genotype_index = -1;
+    char genotype_separator = '/';
+    while (!vcf_file.eof()) {
+        string line;
+        getline(vcf_file, line, '\n');
+        // check ineligible lines
+        if ((int)line.length() <= 1) continue;
+
+        if (line[0] == '#') {
+            continue;
+        }
+        auto columns = split(line, '\t');
+        if (columns.size() < 10) {
+            if(match_mode_indicator != 1){
+                cout << "[VarMatch] Warning: not enough information in VCF file for genotype matching." << endl;
+                cout << "[VarMatch] \tAutomatically turn off genotype matching module " << filename << endl;
+                match_mode_indicator = 1;
+                // NOTE(review): this also drops the line that triggered the
+                // warning - confirm that is intended.
+                continue;
+            }
+            if(columns.size() < 6){
+                cout << "[VarMatch] Warning: not enough information in VCF file for variant matching." << endl;
+                cout << "[VarMatch] skip current variant: " << line << endl;
+                continue;
+            }
+        }
+        string chr_name = columns[0];
+        auto pos = atoi(columns[1].c_str()) - 1; // 0-based coordinate
+
+        auto ref = columns[3];
+        auto alt_line = columns[4];
+        // NOTE(review): stof throws when QUAL is not numeric - confirm that
+        // inputs never carry a missing-value marker here.
+        float quality = stof(columns[5]);
+
+        if(flag){
+            int quality_int = (int) quality;
+            if(quality_que_totalnum.find(quality_int) != quality_que_totalnum.end()){
+                quality_que_totalnum[quality_int] += 1.0;
+            }else{
+                quality_que_totalnum[quality_int] = 1.0;
+            }
+        }
+
+        ToUpper(ref);
+        ToUpper(alt_line);
+
+        bool is_heterozygous_variant = false;
+        bool is_multi_alternatives = false;
+
+        if (columns.size() >= 10) {
+            // The position of GT in the FORMAT column is looked up once and
+            // reused for all following lines.
+            if (genotype_index < 0) {
+                auto formats = split(columns[8], ':');
+                for (int i = 0; i < formats.size(); i++) {
+                    if (formats[i] == "GT") {
+                        genotype_index = i;
+                        break;
+                    }
+                }
+                if(genotype_index < 0){
+                    cout << "[VarMatch] VCF entry does not contain genotype information" << endl;
+                    continue;
+                }
+            }
+            auto additionals = split(columns[9], ':');
+            vector<string> genotype_columns = split(additionals[genotype_index], genotype_separator);
+
+            // Fall back to the phased separator; it then stays in effect for
+            // the following lines.
+            if(genotype_columns.size() != 2){
+                genotype_separator = '|';
+                genotype_columns = split(additionals[genotype_index], genotype_separator);
+            }
+
+            if (genotype_columns.size() != 2) {
+                cout << "[VarMatch] Warning Unrecognized Genotype: " << additionals[genotype_index] << endl;
+                continue;
+            }
+            else {
+                if (genotype_columns[0] != genotype_columns[1]) {
+                    is_heterozygous_variant = true;
+                }
+            }
+
+            if (genotype_columns[1] == "0" && genotype_columns[0] == "0") {
+                continue;
+            }
+        }
+
+        vector<string> alt_list;
+        if (alt_line.find(",") != std::string::npos) {
+            alt_list = split(alt_line, ',');
+            is_multi_alternatives = true;
+        }
+        else {
+            alt_list.push_back(alt_line);
+        }
+
+        // Largest insertion / deletion length over the first two alleles.
+        int snp_ins = max(0, (int)alt_list[0].length() - (int)ref.length());
+        int snp_del = max(0, (int)ref.length() - (int)alt_list[0].length());
+        if(is_multi_alternatives){
+            snp_ins = max(snp_ins, (int)alt_list[1].length() - (int)ref.length());
+            snp_del = max(snp_del, (int)ref.length() - (int)alt_list[1].length());
+        }
+
+        if(snp_ins > VAR_LEN || snp_del > VAR_LEN){
+            continue;
+        }
+
+        // Resolve the chromosome id: prefer the names seen in the genome
+        // file, then the generic human table. Use find() for both so an
+        // unknown name is rejected instead of being default-inserted with
+        // id 0 by operator[].
+        int chr_id;
+        auto loaded_it = chrid_by_chrname.find(chr_name);
+        if(loaded_it != chrid_by_chrname.end()){
+            chr_id = loaded_it->second;
+        }else{
+            auto dict_it = chrname_dict.find(chr_name);
+            if(dict_it == chrname_dict.end()){
+                cout << "[VarMatch] Warning: skip variant on unrecognized chromosome: " << chr_name << endl;
+                continue;
+            }
+            chr_id = dict_it->second;
+        }
+
+        DiploidVariant dv(pos, ref, alt_list, is_heterozygous_variant, is_multi_alternatives, snp_del, snp_ins, flag, quality);
+        if(flag == false){
+            ref_variant_by_chrid[chr_id]->push_back(dv);
+        }else{
+            que_variant_by_chrid[chr_id]->push_back(dv);
+            query_variant_strings.push_back(line);
+        }
+
+        total_num++;
+    }
+    vcf_file.close();
+
+    return total_num;
+}
+
+// Not implemented: reads nothing and reports failure.
+// The explicit return is required because flowing off the end of a
+// non-void function is undefined behaviour.
+bool WholeGenome::ReadVariantFileList(string filename){
+    (void)filename;
+    return false;
+}
+
+// Edit distance between the reference allele of dv and its alternative
+// allele number allele_indicator.
+int WholeGenome::ScoreEditDistance(DiploidVariant & dv, int allele_indicator){
+    const auto & ref_allele = dv.ref;
+    const auto & alt_allele = dv.alts[allele_indicator];
+    return EditDistance(ref_allele, alt_allele);
+}
+
+// Levenshtein distance between s1 and s2 (unit cost for insertion, deletion
+// and substitution), computed with two rolling rows of the DP table.
+inline int WholeGenome::EditDistance(const std::string& s1, const std::string& s2)
+{
+    const std::size_t rows = s1.size();
+    const std::size_t cols = s2.size();
+    std::vector<unsigned int> current(cols + 1);
+    std::vector<unsigned int> previous(cols + 1);
+
+    // Row 0: turning the empty prefix of s1 into a prefix of s2 costs its length.
+    for (unsigned int c = 0; c < previous.size(); c++) {
+        previous[c] = c;
+    }
+
+    for (unsigned int r = 0; r < rows; r++) {
+        current[0] = r + 1;
+        for (unsigned int c = 0; c < cols; c++) {
+            const unsigned int from_above = previous[c + 1] + 1;
+            const unsigned int from_left = current[c] + 1;
+            const unsigned int from_diagonal = previous[c] + (s1[r] == s2[c] ? 0 : 1);
+            current[c + 1] = std::min(std::min(from_above, from_left), from_diagonal);
+        }
+        current.swap(previous);
+    }
+    return previous[cols];
+}
+
+/**
+ * Clusters the variants of every chromosome, running chromosomes in
+ * parallel, and appends all clusters to variants_by_cluster.
+ *
+ * Chromosome ids 0..chrom_num-1 are processed in batches. In each batch up
+ * to (thread count - 1) ids are handed to worker threads and one id is
+ * processed on the calling thread. Ids without an entry in chrname_by_chrid
+ * are skipped. SingleThreadClustering is expected to fill
+ * variant_cluster_by_chrid[chr_id] (not visible here - TODO confirm).
+ *
+ * @return always true
+ */
+bool WholeGenome::ParallelClustering(){
+    // parallel by chr
+    variant_cluster_by_chrid = new vector<vector<VariantIndicator>> *[chrom_num];
+    for (int j = 0; j < chrom_num; j++) {
+        variant_cluster_by_chrid[j] = new vector<vector<VariantIndicator>>;
+    }
+
+    // Guard against a zero thread count, which would divide by zero below.
+    const int worker_num = thread_num < 1 ? 1 : thread_num;
+
+    int parallel_steps = chrom_num / worker_num;
+    if(parallel_steps*worker_num < chrom_num) parallel_steps += 1;
+    int chr_id = 0;
+    for(int i = 0; i < parallel_steps; i++){
+        vector<thread> threads;
+        for(int j = 0; j < worker_num-1 && chr_id < chrom_num-1; j++){
+            if(chrname_by_chrid.find(chr_id) != chrname_by_chrid.end()){
+                threads.push_back(thread(&WholeGenome::SingleThreadClustering, this, chr_id));
+            }
+            chr_id ++;
+        }
+        // The calling thread takes one chromosome itself instead of idling.
+        if(chr_id < chrom_num){
+            if(chrname_by_chrid.find(chr_id) != chrname_by_chrid.end()){
+                SingleThreadClustering(chr_id);
+            }
+            chr_id ++;
+        }
+        std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
+        threads.clear();
+    }
+
+    // Concatenate the per-chromosome clusters in chromosome order.
+    for(int i = 0; i < chrom_num; i++){
+        variants_by_cluster.insert(variants_by_cluster.end(), variant_cluster_by_chrid[i]->begin(), variant_cluster_by_chrid[i]->end());
+    }
+
+    // The per-chromosome storage is only needed during clustering.
+    for(int j = 0; j < chrom_num; j++){
+        variant_cluster_by_chrid[j]->clear();
+        delete variant_cluster_by_chrid[j];
+    }
+    delete[] variant_cluster_by_chrid;
+
+    return true;
+}
+
+// Not implemented: does nothing and reports failure.
+// The explicit return is required because flowing off the end of a
+// non-void function is undefined behaviour.
+bool WholeGenome::ParallelMatching(){
+    return false;
+}
+
+// Not implemented: does nothing and reports failure.
+// The explicit return is required because flowing off the end of a
+// non-void function is undefined behaviour.
+bool WholeGenome::TBBMatching()
+{
+    return false;
+}
+
+
+// Reports whether sequence (compared case-insensitively) consists of at
+// least two consecutive copies of its own prefix; a trailing fragment
+// shorter than the unit is ignored. A one-character sequence counts as a
+// repeat. unit_threshold does not influence the result.
+bool WholeGenome::CheckTandemRepeat(string sequence, int unit_threshold) {
+    const int total_len = (int)sequence.length();
+    if(total_len == 1) return true;
+
+    transform(sequence.begin(), sequence.end(), sequence.begin(), ::toupper);
+
+    const int longest_unit = total_len / 2 + 1;
+    for (int unit_len = 1; unit_len <= longest_unit; unit_len++) {
+        int copies = 1;
+        bool all_copies_equal = true;
+        // Walk over every complete unit-sized window after the first one.
+        for (int at = unit_len; at + unit_len <= total_len; at += unit_len) {
+            if (sequence.compare(at, unit_len, sequence, 0, unit_len) != 0) {
+                all_copies_equal = false;
+                break;
+            }
+            copies++;
+        }
+        if (all_copies_equal && copies > 1) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/**
+ * Matches the baseline and query variants of one cluster.
+ *
+ * Steps:
+ *  1. sort the variants and split them into baseline (flag false) and
+ *     query (flag true);
+ *  2. return false if either side is empty;
+ *  3. if both sides hold exactly one variant and they compare equal, record
+ *     a direct match for every mode index and return true;
+ *  4. if one side holds a single variant and the other more than four,
+ *     return false when the single variant's largest indel exceeds what the
+ *     other side can sum up to;
+ *  5. otherwise run MatchingSingleClusterBaseExtending once per
+ *     (score unit, match mode, score scheme) combination on the reference
+ *     subsequence that spans the cluster.
+ *
+ * @param thread_index index into the per-thread result containers
+ * @param chr_id       chromosome id of the cluster
+ * @param variant_list variants of the cluster; sorted in place
+ * @param cluster_id   id passed through to the matcher
+ * @return false when the cluster was rejected in step 2 or 4, true otherwise
+ */
+bool WholeGenome::MatchVariantListInThread(int thread_index,
+                                           int chr_id,
+                                           vector<DiploidVariant> & variant_list,
+                                           int cluster_id){
+    sort(variant_list.begin(), variant_list.end());
+
+    vector<DiploidVariant> separate_var_list[2]; // [0] baseline, [1] query
+    vector<Interval> intervals;
+    int total_mil = 0;
+    int total_mdl = 0;
+    int min_pos = genome_sequences[chr_id].length() + 1;
+    int max_pos = -1;
+    for (int i = 0; i < variant_list.size(); i++) {
+        int flag = 0;
+        if (variant_list[i].flag) flag = 1; // flag indicate if the variant is from ref set(0) or query set(1)
+        int pos = variant_list[i].pos;
+        separate_var_list[flag].push_back(variant_list[i]);
+        total_mil += variant_list[i].mil;
+        total_mdl += variant_list[i].mdl;
+        const auto & ref_sequence = variant_list[i].ref;
+        min_pos = min(pos, min_pos);
+        max_pos = max((int)(pos + ref_sequence.length()), max_pos);
+
+        int end_pos = pos + ref_sequence.length() - 1; // included end position!!
+        intervals.push_back(Interval(pos, end_pos));
+    }
+    // Pad the window by one base on each side, clamped to the chromosome.
+    min_pos = max(min_pos - 1, 0);
+    max_pos = min(max_pos + 1, (int)genome_sequences[chr_id].length()); //exclusive
+
+    if (separate_var_list[0].size() == 0 || separate_var_list[1].size() == 0) {
+        return false;
+    }
+    if (separate_var_list[0].size() == 1 && separate_var_list[1].size() == 1){
+        // try direct match to save time
+        if(separate_var_list[0][0] == separate_var_list[1][0]){
+
+            DiploidVariant tv = separate_var_list[0][0];
+            string match_record = to_string(tv.pos+1) + "\t" + tv.ref + "\t" + tv.alts[0];
+            if(tv.multi_alts) match_record += "/" + tv.alts[1];
+            match_record += "\t.\t.\t.\t.\t.\n";
+            int qual = (int)(tv.qual);
+
+            for(int mi = 0; mi < mode_index_list.size(); mi ++){
+                int mode_i = mode_index_list[mi];
+                if(mi == 0){
+                    match_records_by_mode_by_thread[thread_index][mode_i]->push_back(match_record);
+                }else{
+                    // "$" stands for "same record as in the first mode"
+                    match_records_by_mode_by_thread[thread_index][mode_i]->push_back("$");
+                }
+                baseline_total_match_num[thread_index]->at(mode_i)++;
+                query_total_match_num[thread_index]->at(mode_i)++;
+                auto end_it = quality_que_matchnum_by_thread_mode[thread_index][mode_i]->end();
+                if(quality_que_matchnum_by_thread_mode[thread_index][mode_i]->find(qual) != end_it){
+                    quality_que_matchnum_by_thread_mode[thread_index][mode_i]->at(qual) += 1.0;
+                }else{
+                    // at() throws for a missing key, so the first count for a
+                    // quality value has to be inserted with operator[].
+                    (*quality_que_matchnum_by_thread_mode[thread_index][mode_i])[qual] = 1.0;
+                }
+            }
+            return true;
+        }
+        // if not match, still can match by changing genome
+    }else if(separate_var_list[0].size() == 1 || separate_var_list[1].size() == 1){
+        int flag = 0;
+        if(separate_var_list[1].size() == 1) flag = 1;
+        int r_flag = 1-flag;
+        if(separate_var_list[r_flag].size() > 4){
+            int total_r_mdl = 0;
+            int total_r_mil = 0;
+
+            for(int k = 0; k < separate_var_list[r_flag].size(); k++){
+                total_r_mdl += separate_var_list[r_flag][k].mdl;
+                total_r_mil += separate_var_list[r_flag][k].mil;
+            }
+
+            if(max(separate_var_list[flag][0].mdl, separate_var_list[flag][0].mil) > max(total_r_mdl, total_r_mil)) return false;
+        }
+    }
+
+    // remove singular variant
+    // [todo] try removing this filter to see running time changes
+    // A variant whose own indel is larger than the indels of all other
+    // variants together is excluded from the decision points.
+    vector<bool> appliable_flag;
+    int total_change = total_mil+total_mdl;
+
+    for(int k = 0; k < variant_list.size(); k++){
+        int max_change = max(variant_list[k].mil, variant_list[k].mdl);
+        if(max_change > total_change-max_change){
+            appliable_flag.push_back(false);
+        }else{
+            appliable_flag.push_back(true);
+        }
+    }
+
+    string subsequence = genome_sequences[chr_id].substr(min_pos, max_pos - min_pos);
+
+    ToUpper(subsequence); // subsequence only contains upper char
+    int offset = min_pos;
+
+    // Decision points: for each side, variant indices keyed by their
+    // position relative to the subsequence.
+    multimap<int, int> * choices_by_pos[2];
+    for(int i = 0; i < 2; i++){
+        choices_by_pos[i] = new multimap<int, int>();
+    }
+
+    for(int index = 0; index < variant_list.size(); index++){
+        if(!appliable_flag[index]) continue;
+        int pos = variant_list[index].pos - offset;
+        int flag = 0;
+        if(variant_list[index].flag) flag = 1;
+        choices_by_pos[flag]->insert(pair<int, int>(pos, index));
+    }
+
+    // Sync points are the (relative) ends of the merged variant intervals,
+    // plus the end of the subsequence.
+    vector<Interval> mergered_intervals = merge(intervals);
+    vector<int> sync_points;
+    for(int i = 0; i < mergered_intervals.size(); i++){
+        sync_points.push_back(mergered_intervals[i].end-offset);
+    }
+
+    if(sync_points.back() < subsequence.size() - 1){
+        sync_points.push_back(subsequence.size()-1);
+    }
+
+    for(int i = 0; i < score_unit_list.size(); i++){
+        int score_unit = score_unit_list[i];
+        for(int j = 0; j < match_mode_list.size(); j++){
+            int match_mode = match_mode_list[j];
+            for(int k = 0; k < score_scheme_list.size(); k++){
+                int score_scheme = score_scheme_list[k];
+
+                MatchingSingleClusterBaseExtending(
+                    cluster_id,
+                    thread_index,
+                    variant_list,
+                    subsequence,
+                    offset,
+                    choices_by_pos,
+                    sync_points,
+                    chr_id,
+                    score_unit,
+                    match_mode,
+                    score_scheme);
+            }
+        }
+    }
+
+    // The maps were allocated above and are not used after this point.
+    // NOTE(review): assumes the matcher does not keep these pointers - confirm.
+    for(int i = 0; i < 2; i++){
+        delete choices_by_pos[i];
+    }
+    return true;
+}
+
+// Matches the clusters with ids in [start, end) on behalf of thread
+// thread_index. Stops early at the end of variants_by_cluster, skips
+// clusters with at most one variant, and always returns true.
+bool WholeGenome::ClusteringMatchInThread(int start, int end, int thread_index) {
+
+    for (int cluster_id = start; cluster_id < end && cluster_id < variants_by_cluster.size(); ++cluster_id) {
+        vector<VariantIndicator> members = variants_by_cluster[cluster_id];
+        if(members.size() <= 1) continue;
+
+        // Resolve every indicator to the variant it points at.
+        vector<DiploidVariant> variant_list;
+        int chr_id = -1;
+        for(int m = 0; m < members.size(); m++){
+            VariantIndicator member = members[m];
+            chr_id = member.chr_id;
+            DiploidVariant resolved;
+            if(member.refer){
+                resolved = ref_variant_by_chrid[chr_id]->at(member.var_id);
+            }else{
+                resolved = que_variant_by_chrid[chr_id]->at(member.var_id);
+            }
+            variant_list.push_back(resolved);
+        }
+
+        const bool chr_id_valid = (chr_id != -1 && chr_id < chrom_num);
+        if(!chr_id_valid){
+            cout << "[VarMatch] Error in matching single cluster" << endl;
+            continue;
+        }
+
+        MatchVariantListInThread(thread_index, chr_id, variant_list, cluster_id);
+    }
+    return true;
+}
+
+
+// to reduce memory usage of paths, move all functions about SequencePath out into WholeGenome with a parameter SequencePath
+// Returns true when, on either side (0 or 1), a variant starts at pos and
+// the path sp has no choice recorded for that position yet.
+bool WholeGenome::PathNeedDecision(SequencePath& sp, multimap<int, int> * choices_by_pos[], int pos){
+    for(int side = 0; side < 2; ++side){
+        const bool variant_starts_here = choices_by_pos[side]->find(pos) != choices_by_pos[side]->end();
+        if(!variant_starts_here) continue;
+
+        const bool undecided = sp.choice_made[side].find(pos) == sp.choice_made[side].end();
+        if(undecided) return true;
+    }
+    return false;
+}
+
+// Checks that the donor sequences of the two sides still agree.
+// Strand s of side 0 (donor_sequences[s]) is compared with strand s of
+// side 1 (donor_sequences[2+s]); match_mode 0 checks strands 0 and 1,
+// any other mode checks strand 0 only.
+//  - all checked pairs have equal length: they must be identical;
+//    same_donor_len is set to true.
+//  - otherwise: each pair must agree on its common prefix, verified from
+//    the last known equal position onwards; same_donor_len is set to false.
+// Returns 0 when the path is still consistent, -1 when it must be dropped.
+int WholeGenome::CheckPathEqualProperty(SequencePath & sp, int match_mode)
+{
+    const int strand_num = (match_mode == 0) ? 2 : 1;
+
+    bool lengths_agree = true;
+    for(int s = 0; s < strand_num; s++){
+        if(sp.donor_sequences[s].length() != sp.donor_sequences[2+s].length()){
+            lengths_agree = false;
+        }
+    }
+
+    if(lengths_agree){
+        for(int s = 0; s < strand_num; s++){
+            if(sp.donor_sequences[s] != sp.donor_sequences[2+s]) return -1;
+        }
+        sp.same_donor_len = true;
+        for(int s = 0; s < strand_num; s++){
+            sp.current_equal_donor_pos[s] = sp.donor_sequences[s].length()-1;
+        }
+        return 0;
+    }
+
+    sp.same_donor_len = false;
+    for(int s = 0; s < strand_num; s++){
+        int shared_len = min(sp.donor_sequences[s].length(), sp.donor_sequences[2+s].length());
+        for(int k = sp.current_equal_donor_pos[s]+1; k < shared_len; k++){
+            if(sp.donor_sequences[s][k] != sp.donor_sequences[2+s][k]){
+                return -1;
+            }
+        }
+        sp.current_equal_donor_pos[s] = shared_len-1;
+    }
+    return 0;
+}
+
+// one step is not one nt, but to the next sync point
+// i.e. one step, one sync point
+// Extends the donor sequences of sp base by base up to the next sync point.
+// Return codes:
+//  -1  the path is inconsistent (or has no sync point left) and must be dropped
+//   0  the sync point was reached and the path is still consistent
+//   1  a variant decision is required at the next position before extending
+//   2  the last sync point was reached and both sides produced equal donors
+// With match_mode 1 only the even strands (0 and 2) are extended.
+int WholeGenome::PathExtendOneStep(SequencePath& sp,
+                                   multimap<int, int> * choices_by_pos[],
+                                   const string & reference_sequence,
+                                   vector<int> & sync_points,
+                                   int match_mode){
+
+    if(sp.reached_sync_num >= sync_points.size()) return -1;
+
+    const int last_pos = sync_points[sp.reached_sync_num]; // inclusive
+    const int strand_step = (match_mode == 1) ? 2 : 1;
+
+    for(int genome_pos = sp.current_genome_pos + 1; genome_pos <= last_pos; ++genome_pos){
+
+        // A pending decision stops the extension; the path survives only if
+        // the donors agree so far.
+        if(PathNeedDecision(sp, choices_by_pos, genome_pos)){
+            return (CheckPathEqualProperty(sp, match_mode) == -1) ? -1 : 1;
+        }
+
+        for(int strand = 0; strand < 4; strand += strand_step){
+            const string & planned = sp.string_sequences[strand][genome_pos];
+            if(planned == "."){
+                // "." means no variant touched this position: copy the reference base
+                sp.donor_sequences[strand] += reference_sequence[genome_pos];
+            }else{
+                sp.donor_sequences[strand] += planned;
+            }
+        }
+        sp.current_genome_pos = genome_pos;
+    }
+
+    sp.reached_sync_num ++;
+
+    if(sp.reached_sync_num < sync_points.size()){
+        return CheckPathEqualProperty(sp, match_mode);
+    }
+
+    // The last sync point is the end of the reference subsequence.
+    const bool donors_agree = sp.donor_sequences[0] == sp.donor_sequences[2] &&
+                              sp.donor_sequences[1] == sp.donor_sequences[3];
+    return donors_agree ? 2 : -1;
+}
+
+/**
+ * Score contributed by applying variant dv with the given allele choice.
+ *
+ * @param dv           the variant
+ * @param choice       allele choice (-1, 0 or 1)
+ * @param score_unit   0: every variant scores 1; 1: score is an edit distance
+ * @param match_mode   0: both alleles contribute as selected by choice;
+ *                     otherwise only allele `choice` contributes
+ * @param score_scheme 0: count every variant; 1: count only variants with
+ *                     flag == false; 2: count only variants with flag == true
+ * @return the score, or 0 when the scheme excludes the variant
+ */
+int WholeGenome::CalculateScore(DiploidVariant & dv,
+                                int choice,
+                                int score_unit,
+                                int match_mode,
+                                int score_scheme){
+    int score = 0;
+    if(score_unit == 0){
+        score = 1;
+    }else if(score_unit == 1){
+        if(match_mode == 0){
+            if(choice == -1){
+                score += ScoreEditDistance(dv, 0);
+            }else if(choice == 0){
+                score += ScoreEditDistance(dv, 0);
+                if(dv.multi_alts){
+                    score += ScoreEditDistance(dv, 1);
+                }
+            }else{
+                score += ScoreEditDistance(dv, 0);
+                score += ScoreEditDistance(dv, 1);
+            }
+        }else{
+            score += ScoreEditDistance(dv, choice);
+        }
+    }
+
+    if(score_scheme == 1){
+        return dv.flag ? 0 : score;
+    }
+    if(score_scheme == 2){
+        return dv.flag ? score : 0;
+    }
+    // Scheme 0. Any other value also lands here, so that control never
+    // flows off the end of the function.
+    return score;
+}
+
+// no genotype means you can maintain only one strand
+// for simplicity, also work on original SequencePath data structure
+// when making decision, only decide one path
+// when extending, only extend one path
+// when comparing, only compare one path
+/**
+ * Branches path sp at position sp.current_genome_pos+1 when genotype is
+ * ignored: only one strand per side is maintained (string_sequences[0] for
+ * side 0 and string_sequences[2] for side 1).
+ *
+ * For each side the candidate choices are "apply no variant" (-1, -1) plus
+ * every (variant index, allele index) starting at this position whose
+ * reference span does not conflict with bases already rewritten by an
+ * earlier decision. One copy of sp is appended to sequence_path_list for
+ * every combination of a side-0 choice with a side-1 choice; each copy has
+ * the chosen alleles written into its strands, its score increased through
+ * CalculateScore and the choice recorded in choice_made.
+ *
+ * reference_sequence is not read by this function.
+ * Always returns true.
+ */ bool WholeGenome::PathMakeDecisionNoGenotype(SequencePath& sp,
+ vector<DiploidVariant> & variant_list,
+ multimap<int, int> * choices_by_pos[],
+ list<SequencePath> & sequence_path_list,
+ const string & reference_sequence,
+ int score_unit,
+ int match_mode,
+ int score_scheme)
+{
+ int pos = sp.current_genome_pos+1; // position at which the decision is made
+ vector<pair<int, int>> candidate_choices[2]; // per side: (variant index, allele index)
+ for(int i = 0; i < 2; i++){
+ // because if it's (-1,-1), it will do nothing, so it's ok to have this one...
+ candidate_choices[i].push_back(pair<int, int>(-1, -1));
+ // to maintain existence
+ // in this position, make choice of not use any variants, no matter if there is variant
+
+ pair<multimap<int, int>::iterator, multimap<int, int>::iterator> var_range;
+ var_range = choices_by_pos[i]->equal_range(pos); // all variants of side i starting at pos
+
+ for(auto it = var_range.first; it != var_range.second; ++it){
+ int var_index = (*it).second;
+ DiploidVariant var = variant_list[var_index];
+ // check if current var influence
+ string ref = var.ref; //even we do not know the offset, we know ref start from pos of reference_sequence
+ string alts[2];
+ alts[0] = var.alts[0];
+ alts[1] = alts[0];
+ if(var.multi_alts){
+ alts[1] = var.alts[1];
+ }
+
+ // not just purely consider if a variant can be applied, but if a choice
+ bool choice_applicable = true;
+ for(int k = 0; k < ref.length(); k++){
+ // for each ref char
+ int y = 0; // only the first strand of the side is checked
+ // for each strain
+ if(sp.string_sequences[i*2+y][k+pos] != "."){
+ // decision in this area has already been made
+ if(k >= alts[y].length()){
+ choice_applicable = false;
+ break;
+ }else{
+ if(ref[k] != alts[y][k]){
+ choice_applicable = false;
+ break;
+ }
+ }
+ }
+ }
+
+ if(choice_applicable){
+ candidate_choices[i].push_back(pair<int, int>(var_index, 0));
+ }
+
+ if(var.multi_alts){
+
+ // a second alternative allele exists: swap the alleles and check whether it is applicable too
+ string temp = alts[0];
+ alts[0] = alts[1];
+ alts[1] = temp;
+
+ choice_applicable = true;
+ for(int k = 0; k < ref.length(); k++){
+ // for each ref char
+ //for(int y = 0; y < 2; y++)
+ int y = 0;
+ // for each strain
+ if(sp.string_sequences[i*2+y][k+pos] != "."){
+ // decision in this area has already been made
+ if(k >= alts[y].length()){
+ // should be a deletion
+ choice_applicable = false;
+ break;
+ }else{
+ // should be equal at current position
+ // can be an insertion, as long as current position is the same
+ if(ref[k] != alts[y][k]){
+ choice_applicable = false;
+ break;
+ }
+ }
+ }
+ }
+
+ if(choice_applicable){
+ candidate_choices[i].push_back(pair<int, int>(var_index, 1));
+ }
+ }
+ }
+ }
+
+ //dout << candidate_choices[0].size() << "," << candidate_choices[1].size() << endl;
+
+ for(int i = 0; i < candidate_choices[0].size(); i++){
+ for(int j = 0; j < candidate_choices[1].size(); j++){
+ // iterate all choices
+ SequencePath path = sp; // every combination works on its own copy of the path
+ pair<int, int> var_choice[2];
+ var_choice[0] = candidate_choices[0][i];
+ var_choice[1] = candidate_choices[1][j];
+ for(int x = 0; x < 2; x++){
+ // iterate truth and predict
+ int var_index = var_choice[x].first;
+ if(var_index != -1){
+ DiploidVariant var = variant_list[var_index];
+ // if(var.flag != x){
+ // dout << "Error" << endl;
+ // }
+ string ref = var.ref;
+ string alts[2];
+ int c = var_choice[x].second; // chosen allele index
+ alts[0] = var.alts[c];
+ path.score += CalculateScore(var,
+ c,
+ score_unit,
+ match_mode,
+ score_scheme);
+
+ ToUpper(ref);
+ ToUpper(alts[0]);
+ int y = 0;
+
+ int k = 0;
+ for(; k < ref.length()-1; k++){ // all reference positions except the last one
+ if(k < alts[y].length()){
+ if(ref[k] != alts[y][k]){
+ path.string_sequences[x*2+y][pos+k] = alts[y].substr(k,1);
+ }
+ // else change nothing
+ }else{
+ path.string_sequences[x*2+y][pos+k] = ""; // reference base is deleted
+ }
+ }
+ // hence k == ref.length()-1, the last position
+ if(k < alts[y].length()){
+ string alt_part = alts[y].substr(k, alts[y].length()-k); // rest of the allele goes to the last reference position
+ if(alt_part.length() > 1){
+ if(alt_part[0] == ref[k]){
+ if(path.string_sequences[x*2+y][pos+k] == "."){
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }else{
+ path.string_sequences[x*2+y][pos+k] += alt_part.substr(1, alt_part.size() - 1);
+ }
+ }else{
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }
+ }else{
+ if(ref[k] != alts[y][k]){
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }
+ }
+ }else{
+ path.string_sequences[x*2+y][pos+k] = "";
+ }
+
+ }
+ path.choice_made[x][pos] = var_choice[x]; // recorded even for the "no variant" choice
+ }
+ sequence_path_list.push_back(path);
+ }
+ }
+
+ //expected number of inserted paths are 2,3,4,6,x...
+ return true;
+}
+
+
+// Branch a path at the position right after sp.current_genome_pos.
+// For each side i (0 and 1; the application loop below calls them "truth and
+// predict") the variants that choices_by_pos[i] lists at that position are
+// screened, and one copy of sp is appended to sequence_path_list for every
+// combination of a side-0 choice and a side-1 choice.
+// A choice is a pair (variant index, phasing); (-1,-1) means "use no variant".
+// reference_sequence is only referenced by commented-out code.
+// Always returns true.
+bool WholeGenome::PathMakeDecision(SequencePath& sp,
+ vector<DiploidVariant> & variant_list,
+ multimap<int, int> * choices_by_pos[],
+ list<SequencePath> & sequence_path_list,
+ const string & reference_sequence,
+ int score_unit,
+ int match_mode,
+ int score_scheme)
+{
+ int pos = sp.current_genome_pos+1;
+
+ // ---- step 1: collect the applicable choices of each side ----
+ vector<pair<int, int>> candidate_choices[2];
+ for(int i = 0; i < 2; i++){
+
+ // because if it's (-1,-1), it will do nothing, so it's ok to have this one...
+ candidate_choices[i].push_back(pair<int, int>(-1, -1));
+ // in this position, make choice of not use any variants, no matter if there is variant
+
+ pair<multimap<int, int>::iterator, multimap<int, int>::iterator> var_range;
+ var_range = choices_by_pos[i]->equal_range(pos);
+
+ for(auto it = var_range.first; it != var_range.second; ++it){
+ int var_index = (*it).second;
+ DiploidVariant var = variant_list[var_index];
+ //PrintVariant(var);
+
+ // check if current var influence
+ string ref = var.ref; //even we do not know the offset, we know ref start from pos of reference_sequence
+ // alts[y] is the allele placed on strain y for phasing 0:
+ // homozygous -> (alt0, alt0), multi-alt -> (alt0, alt1), heterozygous -> (alt0, ref)
+ string alts[2];
+ alts[0] = var.alts[0];
+ alts[1] = alts[0];
+ if(var.multi_alts){
+ alts[1] = var.alts[1];
+ }else if(var.heterozygous){
+ alts[1] = ref;
+ }
+
+ // not just purely consider if a variant can be applied, but if a choice
+
+ // A choice is rejected when, at a position whose entry is no longer "."
+ // (already decided), the allele is shorter than the ref or differs from it.
+ bool choice_applicable = true;
+ for(int k = 0; k < ref.length(); k++){
+ // for each ref char
+ for(int y = 0; y < 2; y++){
+ // for each strain
+ if(sp.string_sequences[i*2+y][k+pos] != "."){
+ // decision in this area has already been made
+ if(k >= alts[y].length()){
+ choice_applicable = false;
+ break;
+ }else{
+ if(ref[k] != alts[y][k]){
+ choice_applicable = false;
+ break;
+ }
+ }
+ }
+ }
+ if(!choice_applicable) break;
+ }
+
+ if(choice_applicable){
+ candidate_choices[i].push_back(pair<int, int>(var_index, 0));
+ }
+
+ if(var.heterozygous){
+
+ //if heterozygous, then there is another choice, check if it is applicable
+ // swap the two alleles and repeat the same applicability test
+ string temp = alts[0];
+ alts[0] = alts[1];
+ alts[1] = temp;
+
+ choice_applicable = true;
+ for(int k = 0; k < ref.length(); k++){
+ // for each ref char
+ for(int y = 0; y < 2; y++){
+ // for each strain
+ if(sp.string_sequences[i*2+y][k+pos] != "."){
+ // decision in this area has already been made
+ if(k >= alts[y].length()){
+ // should be a deletion
+ choice_applicable = false;
+ break;
+ }else{
+ // should be equal at current position
+ // can be an insertion, as long as current position is the same
+ if(ref[k] != alts[y][k]){
+ choice_applicable = false;
+ break;
+ }
+ }
+ }
+ }
+ if(!choice_applicable) break;
+ }
+
+ if(choice_applicable){
+ // swapped phasing is stored as 1 for multi-alt variants and as -1 otherwise
+ if(var.multi_alts){
+ candidate_choices[i].push_back(pair<int, int>(var_index, 1));
+ }else{
+ candidate_choices[i].push_back(pair<int, int>(var_index, -1));
+ }
+ }
+ }
+ }
+ }
+
+ //dout << candidate_choices[0].size() << "," << candidate_choices[1].size() << endl;
+
+ // ---- step 2: emit one new path per (side-0 choice, side-1 choice) pair ----
+ for(int i = 0; i < candidate_choices[0].size(); i++){
+ for(int j = 0; j < candidate_choices[1].size(); j++){
+ // iterate all choices
+ SequencePath path = sp;
+ pair<int, int> var_choice[2];
+ var_choice[0] = candidate_choices[0][i];
+ var_choice[1] = candidate_choices[1][j];
+ for(int x = 0; x < 2; x++){
+ // iterate truth and predict
+ int var_index = var_choice[x].first;
+ if(var_index != -1){
+// string temp_sequence = reference_sequence.substr(pos, 1);
+// path.string_sequences[x*2][pos] = temp_sequence;
+// path.string_sequences[x*2+1][pos] = temp_sequence;
+// }else{
+ // set score
+
+
+ DiploidVariant var = variant_list[var_index];
+ // if(var.flag != x){
+ // dout << "Error" << endl;
+ // }
+ string ref = var.ref;
+ string alts[2];
+
+ // rebuild the per-strain alleles from the stored phasing code c
+ int c = var_choice[x].second;
+ if(c == -1){
+ alts[0] = ref;
+ alts[1] = var.alts[0];
+ }else{
+ // c == 0 or 1
+ alts[0] = var.alts[c];
+ alts[1] = alts[0];
+
+ if(var.multi_alts){
+ // choose 1 or 0
+ alts[1] = var.alts[1- c];
+ }else{
+ // c is 0, choose 0 or -1
+ if(var.heterozygous) alts[1] = ref;
+ }
+ }
+
+ path.score += CalculateScore(var,
+ c,
+ score_unit,
+ match_mode,
+ score_scheme);
+
+ ToUpper(ref);
+ ToUpper(alts[0]);
+ ToUpper(alts[1]);
+ for(int y = 0; y < 2; y++){
+ // iterate two alts
+ int k = 0;
+ // every ref position except the last: write the substituted base,
+ // or "" where the allele has no base left
+ for(; k < ref.length()-1; k++){
+ if(k < alts[y].length()){
+ if(ref[k] != alts[y][k]){
+ path.string_sequences[x*2+y][pos+k] = alts[y].substr(k,1);
+ }
+ // else change nothing
+ }else{
+ path.string_sequences[x*2+y][pos+k] = "";
+ }
+ }
+ // hence k == ref.length()-1, the last position
+ // the last ref position receives the whole remaining allele tail
+ if(k < alts[y].length()){
+ string alt_part = alts[y].substr(k, alts[y].length()-k);
+ if(alt_part.length() > 1){
+ if(alt_part[0] == ref[k]){
+ // tail starts with the ref base: set the cell if it is still ".",
+ // otherwise append everything after that first base
+ if(path.string_sequences[x*2+y][pos+k] == "."){
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }else{
+ path.string_sequences[x*2+y][pos+k] += alt_part.substr(1, alt_part.size() - 1);
+ }
+ }else{
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }
+ }else{
+ if(ref[k] != alts[y][k]){
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }
+ }
+ }else{
+ path.string_sequences[x*2+y][pos+k] = "";
+ }
+ }
+ }
+ // recorded for every side, including the "no variant" choice (-1,-1)
+ path.choice_made[x][pos] = var_choice[x];
+ }
+ // choice made
+ //dout << "after decision at pos " << pos << endl;
+ //PrintPath(path);
+ sequence_path_list.push_back(path);
+ }
+ }
+
+ //expected number of inserted paths are 2,3,4,6,x...
+ return true;
+}
+
+// Debug helper: dump the four per-position string rows, the four donor
+// sequences and the removable flag of a path to stdout.
+void WholeGenome::PrintPath(SequencePath & sp){
+    cout << "- Sequence Path:" << endl;
+
+    cout << "@ String Sequences:" << endl;
+    for(int row = 0; row < 4; ++row){
+        const vector<string> & cells = sp.string_sequences[row];
+        for(size_t col = 0; col < cells.size(); ++col){
+            cout << cells[col] << " ";
+        }
+        cout << endl;
+    }
+
+    cout << "@ Donor Sequences:" << endl;
+    for(int row = 0; row < 4; ++row){
+        cout << sp.donor_sequences[row] << endl;
+    }
+
+    cout << "@ Removable: " << sp.removable << endl;
+}
+
+// next: while until current path list is empty
+// if extend, add to next path list
+// if need decision, make decision, append to current list
+// if reach end, compare with best path
+// Search the best-scoring path for one cluster and emit its match record.
+// Paths are kept in two lists: current_path_list is drained one path at a
+// time, next_path_list collects paths for the following round.
+// The return code of PathExtendOneStep decides what happens to each path:
+//   -1 : the path is dropped
+//    0 : the path is moved to next_path_list
+//    1 : a decision is needed; the decision routine appends the branched
+//        paths to current_path_list
+//    2 : the path is compared against best_path by score
+// Returns false when the best score is <= 0 (nothing is recorded),
+// true after a match record has been constructed.
+bool WholeGenome::MatchingSingleClusterBaseExtending(int cluster_index,
+ int thread_index,
+ vector<DiploidVariant> & variant_list,
+ string & subsequence,
+ int offset,
+ multimap<int, int> * choices_by_pos[],
+ vector<int> & sync_points,
+ int chr_id,
+ int score_unit,
+ int match_mode,
+ int score_scheme){
+ //--------------for unit test------------------------------
+ //dout << variant_list.size() << endl;
+
+ //int chr_id = 0;
+ //-------------end unit test-------------------------------
+
+
+
+ // so a legal sync_points vector contains at least two
+ // first is the end of variant, there should be at least one variant
+ // second is the end of subsequence, there should be at least one nt not influenced by a variant
+
+ list<SequencePath> current_path_list;
+ list<SequencePath> next_path_list;
+ SequencePath sp(subsequence.length());
+ SequencePath best_path = sp; // starts with score 0, so only positive-score paths can replace it
+ current_path_list.push_back(sp);
+ while(current_path_list.size() != 0){
+ bool reach_sync_point = true; // not read anywhere in this function
+ while(current_path_list.size() != 0){
+ SequencePath path = current_path_list.front();
+ current_path_list.pop_front();
+ //dout << path.current_genome_pos << ":" << current_path_list.size() << endl;
+ //PrintPath(path);
+ int is_extend = PathExtendOneStep(path, choices_by_pos, subsequence, sync_points, match_mode);
+ //if(cluster_index == 220730) PrintPath(path);
+ if(is_extend == -1){
+ continue;
+ }
+ else if(is_extend == 0){
+ next_path_list.push_back(path);
+ // here the path is supposed to reach the next sync point
+ }else if(is_extend == 1){
+ // branched paths go back into the list that is currently being drained
+ if(match_mode == 0){
+ PathMakeDecision(path,
+ variant_list,
+ choices_by_pos,
+ current_path_list,
+ subsequence,
+ score_unit,
+ match_mode,
+ score_scheme);
+ }else{
+ PathMakeDecisionNoGenotype(path,
+ variant_list,
+ choices_by_pos,
+ current_path_list,
+ subsequence,
+ score_unit,
+ match_mode,
+ score_scheme);
+ }
+ }else if(is_extend == 2){
+ if(path.score > best_path.score){
+ best_path = path; // only when you reach the very end can you be considered as best path
+ //PrintPath(best_path);
+ }
+ }
+ }
+ // start the next round with the paths collected in next_path_list
+ current_path_list = next_path_list;
+ next_path_list.clear();
+ if(current_path_list.size() > 0){
+ //int current_genome_pos = current_path_list.front().current_genome_pos;
+ // after revise, we do not need this check
+ //if(sync_points.find(current_genome_pos) != sync_points.end()){
+ //dout << "converge paths at position: " << current_genome_pos << endl;
+ //dout << "before converge: " << current_path_list.size() << endl;
+ ConvergePaths(current_path_list);
+ //dout << "after converge: " << current_path_list.size() << endl;
+ //}
+ }
+ }
+ // print best_path
+ if(best_path.score <= 0) return false;
+
+ //dout << "new method: " << best_path.score << endl;
+
+ //==========================output ======================
+ int mode_index = GetIndexFromMatchScore(score_unit, match_mode, score_scheme);
+
+ if(match_mode == 0){
+ ConstructMatchRecord(best_path,
+ variant_list,
+ subsequence,
+ offset,
+ thread_index,
+ chr_id,
+ mode_index);
+ }else{
+ ConstructMatchRecordNoGenotype(best_path,
+ variant_list,
+ subsequence,
+ offset,
+ thread_index,
+ chr_id,
+ mode_index);
+ }
+ return true;
+}
+
+// Build the tab-separated match record of the best path of one cluster
+// (genotype-aware mode) and add it, together with the match counters, to the
+// result containers of (thread_index, mode_index).
+//
+// Record layout: POS(1-based) REF ALT[/ALT2] VCF1 VCF2 PHASE1 PHASE2 SCORE,
+// where VCF1/VCF2 are ';'-separated "pos,ref,alt" entries of the variants
+// chosen on side 0 / side 1 and PHASE1/PHASE2 the matching phasing strings.
+//
+// best_path     : path whose choice_made / donor_sequences / score are reported
+// variant_list  : variants of the cluster, indexed by the ids in choice_made
+// subsequence   : reference sequence of the cluster, printed as REF
+// offset        : 0-based position of subsequence, printed as offset+1
+// chr_id        : unused here (only needed by the disabled normalization step)
+void WholeGenome::ConstructMatchRecord(SequencePath & best_path,
+        vector<DiploidVariant> & variant_list,
+        string & subsequence,
+        int offset,
+        int thread_index,
+        int chr_id,
+        int mode_index){
+    int truth_num = 0;
+    int predict_num = 0;
+
+    // the second donor sequence is only printed when it differs from the first
+    const bool multiple_match = best_path.donor_sequences[0] != best_path.donor_sequences[1];
+
+    string match_record = to_string(offset+1) + "\t" + subsequence + "\t" + best_path.donor_sequences[0];
+    if(multiple_match) match_record += "/" + best_path.donor_sequences[1];
+
+    string vcf_record[2];
+    string phasing_record[2];
+    vector<int> query_qual_list;
+
+    for (int i = 0; i < 2; i++) {
+        for (auto it = best_path.choice_made[i].begin(); it != best_path.choice_made[i].end(); ++it) {
+            const pair<int, int> & selection = it->second;
+            if(selection.first == -1) continue; // "no variant" choice
+            int phasing = selection.second;
+            if (phasing == -1) phasing = 1; // -1 is reported like phasing 1
+            const DiploidVariant & variant = variant_list[selection.first];
+            if(!variant.flag){
+                truth_num++;
+            }else{
+                predict_num++;
+                query_qual_list.push_back((int)variant.qual);
+            }
+
+            string alt_string = variant.alts[0];
+            if(variant.multi_alts){
+                alt_string += "/" + variant.alts[1];
+            }
+
+            string phasing_string = "";
+            if(phasing == 0){
+                phasing_string += "1";
+                if(variant.heterozygous){
+                    phasing_string += variant.multi_alts ? "|2" : "|0";
+                }else{
+                    phasing_string += "|1";
+                }
+            }else if(phasing == 1){
+                phasing_string += variant.multi_alts ? "2|1" : "0|1";
+            }
+
+            vcf_record[i] += to_string(variant.pos+1) + "," + variant.ref + "," + alt_string;
+            vcf_record[i] += ";";
+            phasing_record[i] += phasing_string;
+            phasing_record[i] += ";";
+        }
+
+        // drop the trailing ';' (nothing to drop when no variant was recorded)
+        if(!vcf_record[i].empty()) vcf_record[i].pop_back();
+        if(!phasing_record[i].empty()) phasing_record[i].pop_back();
+    }
+
+    // Credit every matched query variant with truth_num/predict_num under its
+    // quality value. query_qual_list is non-empty only if predict_num > 0, so
+    // the division is safe inside the guard.
+    if(!query_qual_list.empty()){
+        float average_count = (float)truth_num/float(predict_num);
+        map<int, float> & qual_matchnum = *quality_que_matchnum_by_thread_mode[thread_index][mode_index];
+        for(size_t q = 0; q < query_qual_list.size(); q++){
+            // operator[] creates a missing key with value 0; the previous
+            // at(qual) = ... threw std::out_of_range for a new quality value
+            qual_matchnum[query_qual_list[q]] += average_count;
+        }
+    }
+
+    match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+    match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+    match_record += "\t" + to_string(best_path.score) + "\n";
+
+    match_records_by_mode_by_thread[thread_index][mode_index]->push_back(match_record);
+
+    baseline_total_match_num[thread_index]->at(mode_index) += truth_num;
+    query_total_match_num[thread_index]->at(mode_index) += predict_num;
+}
+
+
+// Build the tab-separated match record of the best path of one cluster when
+// genotypes are ignored, and add it, together with the match counters, to the
+// result containers of (thread_index, mode_index).
+//
+// Record layout: POS(1-based) REF ALT VCF1 VCF2 PHASE1 PHASE2 SCORE. Only
+// donor_sequences[0] is printed as ALT. Choices with variant id -1 or
+// phasing -1 are skipped; phasing 0 is printed as "1|1", phasing 1 as "2|2".
+//
+// chr_id is unused here (only needed by the disabled normalization step).
+void WholeGenome::ConstructMatchRecordNoGenotype(SequencePath & best_path,
+        vector<DiploidVariant> & variant_list,
+        string & subsequence,
+        int offset,
+        int thread_index,
+        int chr_id,
+        int mode_index){
+    int truth_num = 0;
+    int predict_num = 0;
+
+    string match_record = to_string(offset+1) + "\t" + subsequence + "\t" + best_path.donor_sequences[0];
+
+    string vcf_record[2];
+    string phasing_record[2];
+    vector<int> query_qual_list;
+
+    for (int i = 0; i < 2; i++) {
+        for (auto it = best_path.choice_made[i].begin(); it != best_path.choice_made[i].end(); ++it) {
+            const pair<int, int> & selection = it->second;
+            const int phasing = selection.second;
+            if(selection.first == -1) continue;
+            if (phasing == -1) continue;
+            const DiploidVariant & variant = variant_list[selection.first];
+            if(!variant.flag){
+                truth_num++;
+            }else{
+                predict_num++;
+                query_qual_list.push_back((int)variant.qual);
+            }
+
+            string alt_string = variant.alts[0];
+            if(variant.multi_alts){
+                alt_string += "/" + variant.alts[1];
+            }
+
+            string phasing_string = "";
+            if(phasing == 0){
+                phasing_string += "1|1";
+            }else if(phasing == 1){
+                phasing_string += "2|2";
+            }
+
+            vcf_record[i] += to_string(variant.pos+1) + "," + variant.ref + "," + alt_string;
+            vcf_record[i] += ";";
+            phasing_record[i] += phasing_string;
+            phasing_record[i] += ";";
+        }
+
+        // drop the trailing ';' (nothing to drop when no variant was recorded)
+        if(!vcf_record[i].empty()) vcf_record[i].pop_back();
+        if(!phasing_record[i].empty()) phasing_record[i].pop_back();
+    }
+
+    match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+    match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+    match_record += "\t" + to_string(best_path.score) + "\n";
+
+    match_records_by_mode_by_thread[thread_index][mode_index]->push_back(match_record);
+
+    // Credit every matched query variant with truth_num/predict_num under its
+    // quality value. query_qual_list is non-empty only if predict_num > 0, so
+    // the division is safe inside the guard.
+    if(!query_qual_list.empty()){
+        float average_count = (float)truth_num/float(predict_num);
+        map<int, float> & qual_matchnum = *quality_que_matchnum_by_thread_mode[thread_index][mode_index];
+        for(size_t q = 0; q < query_qual_list.size(); q++){
+            // operator[] creates a missing key with value 0; the previous
+            // at(qual) = ... threw std::out_of_range for a new quality value
+            qual_matchnum[query_qual_list[q]] += average_count;
+        }
+    }
+
+    baseline_total_match_num[thread_index]->at(mode_index) += truth_num;
+    query_total_match_num[thread_index]->at(mode_index) += predict_num;
+}
+
+// Return true when paths a and b have the same donor-sequence lengths on both
+// sides. Sequences 0/1 form one pair and 2/3 the other; within a pair the two
+// lengths may match in either order.
+bool WholeGenome::DonorLengthEqual(SequencePath & a, SequencePath & b){
+    const size_t a0 = a.donor_sequences[0].length();
+    const size_t a1 = a.donor_sequences[1].length();
+    const size_t a2 = a.donor_sequences[2].length();
+    const size_t a3 = a.donor_sequences[3].length();
+    const size_t b0 = b.donor_sequences[0].length();
+    const size_t b1 = b.donor_sequences[1].length();
+    const size_t b2 = b.donor_sequences[2].length();
+    const size_t b3 = b.donor_sequences[3].length();
+
+    const bool truth_same = (a0 == b0 && a1 == b1) || (a0 == b1 && a1 == b0);
+    const bool query_same = (a2 == b2 && a3 == b3) || (a2 == b3 && a3 == b2);
+
+    return truth_same && query_same;
+}
+
+// Predicate for list::remove_if: true for paths flagged as removable.
+bool IsRemovable(SequencePath & s){
+    const bool flagged = s.removable;
+    return flagged;
+}
+
+// Prune redundant paths from path_list.
+// Two paths are compared only if neither is already flagged removable and
+// both have same_donor_len set. When DonorLengthEqual() holds for such a pair,
+// the lower-scoring path is flagged (on a tie the later one); all flagged
+// paths are erased at the end.
+//
+// The paths are inspected through references: the previous version deep-copied
+// both SequencePath objects for every comparison, which is costly because each
+// path owns several string vectors. The outcome is unchanged, since a path's
+// flag is only set right before the loop that reads it is left or advanced.
+void WholeGenome::ConvergePaths(list<SequencePath> & path_list){
+    if(path_list.size() <= 1) return;
+
+    for(list<SequencePath>::iterator i = path_list.begin(); i != path_list.end(); ++i){
+        SequencePath & ref_path = *i;
+        if(ref_path.removable) continue;
+        if(!ref_path.same_donor_len) continue;
+
+        list<SequencePath>::iterator j = i;
+        ++j;
+        for(; j != path_list.end(); ++j){
+            SequencePath & que_path = *j;
+            if(que_path.removable) continue;
+            if(!que_path.same_donor_len) continue;
+
+            if(DonorLengthEqual(ref_path, que_path)){
+                if(ref_path.score >= que_path.score){
+                    que_path.removable = true;
+                }else{
+                    // the reference path lost: stop comparing against it
+                    ref_path.removable = true;
+                    break;
+                }
+            }
+        }
+    }
+
+    path_list.remove_if(IsRemovable);
+}
+
+// Leftover unit-test scaffold: installs a tiny genome sequence and builds a
+// list of eight sample variants. The matching call that used them is disabled,
+// so nothing else happens. Always returns 0.
+int WholeGenome::test() {
+    genome_sequences[0] = "GTCAGCCGG";
+
+    DiploidVariant var1(1, "T", vector<string>{"A", "C"}, true, true, 0, 0, 0);
+    DiploidVariant var2(4, "G", vector<string>{"C", ""}, true, false, 0, 0, 0);
+    DiploidVariant var3(5, "C", vector<string>{"T", ""}, true, false, 0, 0, 0); // this is false negative
+    DiploidVariant var4(6, "C", vector<string>{"G", ""}, true, false, 0, 0, 0);
+    DiploidVariant var5(7, "G", vector<string>{"A", ""}, true, false, 0, 0, 0);
+    DiploidVariant var6(1, "T", vector<string>{"A", "C"}, true, true, 0, 0, 1);
+    DiploidVariant var7(3, "AG", vector<string>{"A", ""}, true, false, 1, 0, 1);
+    DiploidVariant var8(7, "G", vector<string>{"GA", ""}, true, false, 0, 1, 1);
+
+    vector<DiploidVariant> var_list = { var1, var2, var3, var4, var5, var6, var7, var8 };
+    //cout << MatchingSingleClusterBaseExtending(var_list, 0) << endl;
+    return 0;
+}
+
+// private
+// Match all clusters in variants_by_cluster with thread_num workers and write
+// the results.
+//
+// Steps:
+//   1. allocate one result container per (thread, match mode) and one counter
+//      vector per thread;
+//   2. hand a contiguous range of cluster_step clusters to each of
+//      thread_num-1 spawned threads and process the last range on the calling
+//      thread, then join;
+//   3. write <output_prefix>.stat (also echoed to stdout) and one
+//      <output_prefix>.<unit>_<mode>_<scheme>.match file per mode combination;
+//   4. create an empty <output_prefix>.roc file;
+//   5. free everything allocated in step 1.
+//
+// Fix: the third column of the stat header was labelled "score_unit" a second
+// time although the rows print score_scheme there.
+void WholeGenome::ClusteringMatchMultiThread() {
+    int start = 0;
+    int cluster_number = variants_by_cluster.size(); // cluster number
+    int cluster_step = cluster_number / thread_num; // clusters per thread, rounded up below
+    if (cluster_step * thread_num < cluster_number) cluster_step++;
+    int end = start + cluster_step;
+
+    // ---- 1. per-thread result containers ----
+    match_records_by_mode_by_thread = new vector<string>**[thread_num];
+    quality_que_matchnum_by_thread_mode = new map<int, float> ** [thread_num];
+    for(int t = 0; t < thread_num; t++){
+        match_records_by_mode_by_thread[t] = new vector<string>*[MATCH_MODE_NUM];
+        quality_que_matchnum_by_thread_mode[t] = new map<int, float>*[MATCH_MODE_NUM];
+        for(int j = 0; j < MATCH_MODE_NUM; j++){
+            match_records_by_mode_by_thread[t][j] = new vector<string>;
+            quality_que_matchnum_by_thread_mode[t][j] = new map<int, float>;
+        }
+    }
+
+    baseline_total_match_num = new vector<int>* [thread_num];
+    query_total_match_num = new vector<int> * [thread_num];
+    for(int t = 0; t < thread_num; t++){
+        baseline_total_match_num[t] = new vector<int>;
+        baseline_total_match_num[t]->resize(MATCH_MODE_NUM, 0);
+        query_total_match_num[t] = new vector<int>;
+        query_total_match_num[t]->resize(MATCH_MODE_NUM, 0);
+    }
+
+    // ---- 2. run the workers ----
+    vector<thread> threads;
+    unsigned i = 0;
+    for (; i < thread_num - 1; i++) {
+        threads.push_back(thread(&WholeGenome::ClusteringMatchInThread, this, start, end, i));
+        start = end;
+        end = start + cluster_step;
+    }
+    // the calling thread takes the last range; here i equals thread_num - 1
+    if (i != thread_num - 1) {
+        dout << "[Error] thread number not match" << endl;
+    }
+    if (start >= variants_by_cluster.size()) {
+        dout << "[Error] index out of map range" << endl;
+    }
+    else {
+        ClusteringMatchInThread(start, end, i);
+    }
+
+    std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
+
+    // ---- 3a. summary statistics ----
+    ofstream output_stat_file;
+    output_stat_file.open(output_dir + "/" + output_prefix+".stat");
+
+    cout << "=========VarMatch Result Stat.=======" << endl;
+    string stat_head_string = "#score_unit\tmatch_mode\tscore_scheme\tbaseline_match_num\tquery_match_num";
+    cout << stat_head_string << endl;
+    output_stat_file << "##Baseline:" << baseline_variant_total_num << endl;
+    output_stat_file << "##Query:"<< query_variant_total_num << endl;
+    output_stat_file << stat_head_string << endl;
+
+    int score_unit;
+    int match_mode;
+    int score_scheme;
+
+    for(int x = 0; x < score_unit_list.size(); x++){
+        score_unit = score_unit_list[x];
+        for(int y = 0; y < match_mode_list.size(); y++){
+            match_mode = match_mode_list[y];
+            for(int z = 0; z < score_scheme_list.size(); z++){
+                score_scheme = score_scheme_list[z];
+                int mode_index = GetIndexFromMatchScore(score_unit, match_mode, score_scheme);
+
+                // sum the per-thread counters of this mode
+                int baseline_match_num_by_threshold_by_mode = 0;
+                int query_match_num_by_threshold_by_mode = 0;
+                for(int t = 0; t < thread_num; t++){
+                    baseline_match_num_by_threshold_by_mode += baseline_total_match_num[t]->at(mode_index);
+                    query_match_num_by_threshold_by_mode += query_total_match_num[t]->at(mode_index);
+                }
+
+                string total_match_num_string = to_string(score_unit) + "\t" +
+                    to_string(match_mode) + "\t" +
+                    to_string(score_scheme) + "\t" +
+                    to_string(baseline_match_num_by_threshold_by_mode) + "\t" +
+                    to_string(query_match_num_by_threshold_by_mode);
+                cout << total_match_num_string << endl;
+                output_stat_file << total_match_num_string << endl;
+            }
+        }
+    }
+    output_stat_file.close();
+
+    // ---- 3b. one .match file per mode combination ----
+    for(int x = 0; x < score_unit_list.size(); x++){
+        score_unit = score_unit_list[x];
+        for(int y = 0; y < match_mode_list.size(); y++){
+            match_mode = match_mode_list[y];
+            for(int z = 0; z < score_scheme_list.size(); z++){
+                score_scheme = score_scheme_list[z];
+                int mode_index = GetIndexFromMatchScore(score_unit, match_mode, score_scheme);
+                string filename_index = to_string(score_unit) + "_" + to_string(match_mode) + "_" + to_string(score_scheme);
+
+                ofstream output_complex_file;
+                output_complex_file.open(output_dir + "/" + output_prefix+"."+filename_index+".match");
+
+                output_complex_file << "##VCF1:" << ref_vcf_filename << endl;
+                output_complex_file << "##VCF2:" << que_vcf_filename << endl;
+                output_complex_file << "#CHROM\tPOS\tREF\tALT\tVCF1\tVCF2\tPHASE1\tPHASE2\tSCORE" << endl;
+
+                for(int t = 0; t < thread_num; t++){
+                    vector<string> * records = match_records_by_mode_by_thread[t][mode_index];
+                    for(int k = 0; k < records->size(); k++){
+                        // skip records that consist of blanks only
+                        if (records->at(k).find_first_not_of(' ') != std::string::npos) {
+                            output_complex_file << records->at(k);
+                        }
+                    }
+                }
+                output_complex_file.close();
+            }
+        }
+    }
+
+    // ---- 4. ROC output (unfinished) ----
+    // NOTE(review): the tally below is never written anywhere, and it adds 1.0
+    // per (thread, quality) entry rather than the accumulated match count
+    // stored in the map - confirm the intent before building on it.
+    map<int, float> query_qual_matchnum[MATCH_MODE_NUM];
+    for(int m = 0; m < mode_index_list.size(); m++){
+        int mode_index = mode_index_list[m];
+        for(int t = 0; t < thread_num; t++){
+            auto matchmap_pointer = quality_que_matchnum_by_thread_mode[t][mode_index];
+            for(auto it = matchmap_pointer->begin(); it != matchmap_pointer->end(); ++it){
+                // operator[] starts a missing quality at 0
+                query_qual_matchnum[mode_index][it->first] += 1.0;
+            }
+        }
+    }
+
+    // the file is still created (empty), as before
+    ofstream output_roc_file;
+    output_roc_file.open(output_dir + "/" + output_prefix+".roc");
+    output_roc_file.close();
+
+    // ---- 5. release the per-thread containers ----
+    for(int t = 0; t < thread_num; t++){
+        for(int j = 0; j < MATCH_MODE_NUM; j++){
+            delete match_records_by_mode_by_thread[t][j];
+            delete quality_que_matchnum_by_thread_mode[t][j];
+        }
+        delete[] match_records_by_mode_by_thread[t];
+        delete[] quality_que_matchnum_by_thread_mode[t];
+        delete baseline_total_match_num[t];
+        delete query_total_match_num[t];
+    }
+    delete[] match_records_by_mode_by_thread;
+    delete[] quality_que_matchnum_by_thread_mode;
+    delete[] baseline_total_match_num;
+    delete[] query_total_match_num;
+
+}
+
+
+// Trim a (ref, alt0, alt1) allele triple to a parsimonious form, in place.
+// Bases shared by all three alleles are removed from the right end first;
+// whenever that leaves an allele empty, the genome base to the left of the
+// variant is prepended to all three. Then shared leading bases are removed
+// while every allele keeps at least one base.
+//
+// pos    : 0-based start of the alleles in genome_sequences[chr_id]
+// return : the adjusted 0-based start position, or -1 when the chromosome
+//          sequence is empty.
+//
+// Fix: when all three alleles are single bases the function used to
+// `return true`, i.e. position 1, instead of the unchanged position.
+int WholeGenome::NormalizeVariantSequence(int pos, string & parsimonious_ref, string & parsimonious_alt0, string & parsimonious_alt1, int chr_id) {
+
+    int left_index = pos;
+    if (genome_sequences[chr_id].size() == 0) return -1;
+    // nothing to trim: the position stays as it is
+    if (parsimonious_ref.size() == 1 && parsimonious_alt0.size() == 1 && parsimonious_alt1.size() == 1) return left_index;
+
+    // right trimming, extending to the left when an allele runs empty
+    bool change_in_allels = true;
+    while (change_in_allels) {
+        change_in_allels = false;
+        if (parsimonious_ref.back() == parsimonious_alt0.back() && parsimonious_ref.back() == parsimonious_alt1.back() ) {
+            if ((parsimonious_ref.size() > 1 && parsimonious_alt0.size() > 1 && parsimonious_alt1.size() > 1) || left_index > 0) { // when left_index == 0, can not make further changes
+                parsimonious_ref.pop_back();
+                parsimonious_alt0.pop_back();
+                parsimonious_alt1.pop_back();
+                change_in_allels = true;
+            }
+            // else do not make further changes
+        }
+        if (parsimonious_ref.length() == 0 || parsimonious_alt0.length() == 0 || parsimonious_alt1.length() == 0) {
+            left_index--;
+            char left_char = toupper(genome_sequences[chr_id][left_index]);
+            parsimonious_ref = left_char + parsimonious_ref;
+            parsimonious_alt0 = left_char + parsimonious_alt0;
+            parsimonious_alt1 = left_char + parsimonious_alt1;
+        }
+    }
+
+    // left trimming; every removed base moves the variant start one to the right
+    while (parsimonious_ref[0] == parsimonious_alt0[0] &&
+        parsimonious_ref[0] == parsimonious_alt1[0] &&
+        parsimonious_ref.size() > 1 &&
+        parsimonious_alt0.size() > 1 &&
+        parsimonious_alt1.size() > 1)
+    {
+        parsimonious_ref.erase(0, 1);
+        parsimonious_alt0.erase(0, 1);
+        parsimonious_alt1.erase(0, 1);
+        left_index ++;
+    }
+    return left_index;
+}
+
+// Cluster the baseline and query variants of chromosome chr_id.
+// Both variant vectors are sorted, then walked in merged order by position
+// (on equal positions the query variant is taken first). Each variant is
+// added to the running cluster vi_list as a VariantIndicator. Before a
+// variant is added, the gap between the end of the previous variants
+// (c_start) and its own position decides whether the running cluster is
+// closed and pushed to variant_cluster_by_chrid[chr_id].
+// NOTE(review): the cluster still in vi_list when the loop ends is not pushed
+// inside this function - confirm that callers do not expect it.
+void WholeGenome::SingleThreadClustering(int chr_id) {
+ // accumulated snp.mil / snp.mdl values of the running cluster, indexed by flag (0 or 1)
+ int ins_len[2] = { 0 };
+ int del_len[2] = { 0 };
+ int c_start = 0;
+ int c_end = 0;
+ sort(ref_variant_by_chrid[chr_id]->begin(), ref_variant_by_chrid[chr_id]->end());
+ sort(que_variant_by_chrid[chr_id]->begin(), que_variant_by_chrid[chr_id]->end());
+ int ref_size = ref_variant_by_chrid[chr_id]->size();
+ int que_size = que_variant_by_chrid[chr_id]->size();
+ //dout << chr_id << "," << ref_size << "," << que_size << endl;
+
+ int ref_index = 0;
+ int que_index = 0;
+ bool not_first = false;
+ DiploidVariant snp;
+ vector<VariantIndicator> vi_list;
+ while (ref_index < ref_size || que_index < que_size) {
+ // pick the next variant in position order; the query side wins ties
+ bool take_que = true;
+ if(ref_index < ref_size && que_index < que_size){
+ if(ref_variant_by_chrid[chr_id]->at(ref_index).pos < que_variant_by_chrid[chr_id]->at(que_index).pos){
+ take_que = false;
+ }
+ }else if(ref_index < ref_size){
+ take_que = false;
+ }
+ int var_index;
+ if(take_que){
+
+ snp = que_variant_by_chrid[chr_id]->at(que_index);
+ //cout << "q |" << que_index << "," << snp.pos << endl;
+ var_index = que_index;
+ que_index++;
+ }else{
+ snp = ref_variant_by_chrid[chr_id]->at(ref_index);
+ //cout << "r |" << ref_index << "," << snp.pos << endl;
+ var_index = ref_index;
+ ref_index++;
+ }
+ // check if need to separator clusters
+ if (not_first) {
+ c_end = snp.pos;
+ if (c_end - c_start >= 2) {
+ int separator_length = c_end - c_start;
+ string separator = genome_sequences[chr_id].substr(c_start, separator_length);
+ int max_change = max(ins_len[0] + del_len[1], ins_len[1] + del_len[0]);
+ bool separate_cluster = false;
+ // split when nothing has been accumulated, or when the gap is more than
+ // twice max_change and is either longer than MAX_REPEAT_LEN or fails
+ // the CheckTandemRepeat test
+ if(max_change == 0){
+ separate_cluster = true;
+ }
+ else if (separator_length > 2 * max_change &&
+ (separator_length > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, max_change)))
+ {
+ separate_cluster = true;
+ }
+
+ if(separate_cluster){
+ variant_cluster_by_chrid[chr_id]->push_back(vi_list);
+ vi_list.clear();
+ ins_len[0] = 0;
+ del_len[0] = 0;
+ ins_len[1] = 0;
+ del_len[1] = 0;
+ c_start = 0; // re-assign c_start
+ }
+ }
+ }
+ // c_start tracks the furthest reference end (pos + ref length) seen in the running cluster
+ c_start = max(c_start, snp.pos + (int)snp.ref.length() );
+ VariantIndicator current_variant_indicator(chr_id, var_index, !take_que);
+ vi_list.push_back(current_variant_indicator);
+ //cluster_vars_map[cluster_index].push_back(snp);
+ if(!not_first) not_first = true;
+ int ref_length = (int)(snp.ref.length()); // not used below
+ int flag = 0;
+ if(snp.flag) flag = 1;
+// DiploidVariant snp = front_cluster[k];
+// int rq = snp.flag;
+ ins_len[flag] += snp.mil;
+ del_len[flag] += snp.mdl;
+ }
+}
+
+// Read the baseline VCF: forwards to ReadWholeGenomeVariant with its second
+// argument set to false and returns that call's result.
+int WholeGenome::ReadReferenceVariants(string filename){
+    const bool second_argument = false;
+    return ReadWholeGenomeVariant(filename, second_argument);
+}
+
+// Read the query VCF: forwards to ReadWholeGenomeVariant with its second
+// argument set to true and returns that call's result.
+int WholeGenome::ReadQueryVariants(string filename){
+    const bool second_argument = true;
+    return ReadWholeGenomeVariant(filename, second_argument);
+}
+
+// Load the genome sequence file and the baseline VCF, remembering the number
+// returned by the baseline read and the baseline file name.
+void WholeGenome::ReadRef(string genome_seq, string ref_vcf){
+    ReadWholeGenomeSequence(genome_seq);
+
+    const int baseline_count = ReadReferenceVariants(ref_vcf);
+    this->baseline_variant_total_num = baseline_count;
+    this->ref_vcf_filename = ref_vcf;
+}
+
+// Compare one query VCF against the baseline loaded earlier.
+//
+// When score_scheme_indicator is 3 only DirectMatch() is run. Otherwise the
+// query variants are read, clustered and matched, and the result files are
+// written by ClusteringMatchMultiThread().
+//
+// The per-chromosome query vectors are allocated here and released before
+// returning on every path. Fix: the direct-match early return used to leak
+// them.
+void WholeGenome::Compare(string query_vcf,
+        string output_prefix,
+        bool detail_results)
+{
+    // initialize query variant data structure
+    que_variant_by_chrid = new vector<DiploidVariant>*[chrom_num];
+    for (int j = 0; j < chrom_num; j++) {
+        que_variant_by_chrid[j] = new vector<DiploidVariant>;
+    }
+
+    if(score_scheme_indicator == 3){
+        DirectMatch(ref_vcf_filename, query_vcf);
+        // release the query vectors exactly like the regular path below
+        for(int j = 0; j < chrom_num; j++){
+            delete que_variant_by_chrid[j];
+        }
+        delete[] que_variant_by_chrid;
+        return;
+    }
+    que_vcf_filename = query_vcf;
+
+    this->output_prefix = output_prefix;
+    this->detail_results = detail_results;
+
+    query_variant_total_num = ReadQueryVariants(query_vcf);
+    cout << "Baseline VCF: " << ref_vcf_filename << endl;
+    cout << "Query VCF: " << query_vcf << endl;
+    cout << "========VCF Stat.==========" << endl;
+    cout << "Total Number of VCF Entries: " << endl;
+    cout << "Baseline: " << baseline_variant_total_num << "; Query: " << query_variant_total_num << endl;
+
+    ParallelClustering();
+    ClusteringMatchMultiThread();
+
+    // most clustering results are cleared inside ParallelClustering function except the following one
+    // which is needed for matching
+    variants_by_cluster.clear();
+
+    // clean at the end of function
+    for(int j = 0; j < chrom_num; j++){
+        que_variant_by_chrid[j]->clear();
+        delete que_variant_by_chrid[j];
+    }
+    delete[] que_variant_by_chrid;
+
+    query_variant_strings.clear();
+    query_variant_total_num = 0;
+    quality_que_totalnum.clear();
+    // The following three matching results are cleared inside ClusteringMatchMultiThread function
+    // match_records_by_mode_by_thread;
+    // baseline_total_match_num;
+    // query_total_match_num;
+
+    return;
+}
+
+// Count query variants that have an equal baseline variant at the same
+// position, chromosome by chromosome, and report the count through dout.
+// Equality is operator== unless match_mode_indicator is 1, in which case
+// CompareNoGenotype() is used. Each query variant is counted at most once.
+void WholeGenome::DirectMatch(string ref_vcf, string query_vcf)
+{
+    int ref_variant_num = ReadReferenceVariants(ref_vcf);
+    int que_variant_num = ReadQueryVariants(query_vcf);
+    dout << ref_variant_num << "," << que_variant_num << endl;
+
+    int match_num = 0;
+    for(int chr = 0; chr < chrom_num; chr++){
+        auto & ref_vars = *ref_variant_by_chrid[chr];
+        auto & que_vars = *que_variant_by_chrid[chr];
+        if(ref_vars.size() == 0 || que_vars.size() == 0)
+            continue;
+
+        // position -> index of each baseline variant on this chromosome
+        //[TODO] not the right way to do it, at least need multimap
+        multimap<int, int> ref_index_by_pos;
+        for(int r = 0; r < ref_vars.size(); r++){
+            DiploidVariant ref_entry = ref_vars.at(r);
+            ref_index_by_pos.insert(pair<int, int>(ref_entry.pos, r));
+        }
+
+        for(int q = 0; q < que_vars.size(); q++){
+            DiploidVariant var = que_vars.at(q);
+
+            // an empty range means no baseline variant at this position
+            auto same_pos = ref_index_by_pos.equal_range(var.pos);
+            for(auto it = same_pos.first; it != same_pos.second; ++it){
+                DiploidVariant ref_var = ref_vars.at(it->second);
+                const bool matched = (match_mode_indicator == 1)
+                    ? var.CompareNoGenotype(ref_var)
+                    : (var == ref_var);
+                if(matched){
+                    match_num ++;
+                    break;
+                }
+            }
+        }
+    }
+    dout << "matched variants: " << match_num << endl;
+}
diff --git a/src/wholegenome_backup.h b/src/wholegenome_backup.h
new file mode 100644
index 0000000..8746e48
--- /dev/null
+++ b/src/wholegenome_backup.h
@@ -0,0 +1,274 @@
+#pragma once
+
+#include <unordered_map>
+#include <algorithm>
+#include <vector>
+#include <map>
+#include <list>
+#include <tuple>
+#include <chrono>
+#include <cstdint>
+#include <limits>
+#include <thread>
+#include <set>
+
+#include "util.h"
+#include "diploidvariant.h"
+//#include "tbb/task_scheduler_init.h"
+//#include "tbb/blocked_range.h"
+//#include "tbb/parallel_for.h"
+//#include "tbb/concurrent_vector.h"
+
+// Lightweight handle identifying one variant: which chromosome list it lives
+// in, its index inside that list, and whether it belongs to the reference
+// (baseline) set (refer == true) or to the query set.
+struct VariantIndicator{
+    char chr_id; // chromosome id; note the constructor's int argument is narrowed to char
+    int var_id;  // index of the variant inside its per-chromosome list
+    bool refer;  // true: reference set, false: query set
+
+    VariantIndicator(int chr_id_ = -1, int var_id_ = -1, bool refer_ = true)
+        : chr_id(chr_id_), var_id(var_id_), refer(refer_)
+    {}
+};
+
+// Integer range [start, end]; a default-constructed Interval is [0, 0].
+struct Interval {
+    int start;
+    int end;
+
+    Interval() : start(0), end(0) {}
+    Interval(int s, int e) : start(s), end(e) {}
+};
+
+// One candidate path explored while matching a variant cluster.
+// It carries four per-position string tracks, four donor sequences, the
+// choices taken so far and the bookkeeping used to compare/prune paths.
+class SequencePath{
+public:
+    // n: length of the reference subsequence this path runs over.
+    explicit SequencePath(int n)
+        : reference_length(n),
+          current_genome_pos(-1),
+          score(0),
+          removable(false),
+          same_donor_len(false),
+          reached_sync_num(0)
+    {
+        for(int track = 0; track < 4; track++){
+            // every position starts as "." (placeholder value)
+            string_sequences[track].assign(n, ".");
+            donor_sequences[track].clear();
+        }
+        current_equal_donor_pos[0] = -1;
+        current_equal_donor_pos[1] = -1;
+    }
+
+    int reference_length;
+    vector<string> string_sequences[4];
+    // choice_made[k][pos] = (variant id, phasing index); presence of a key
+    // means a choice has already been made at that position
+    map<int, pair<int, int>> choice_made[2];
+    int current_genome_pos;
+    string donor_sequences[4];
+    int current_equal_donor_pos[2];
+    int score;
+    bool removable;
+    bool same_donor_len;
+    int reached_sync_num;
+};
+
+// WholeGenome declares the state and operations used to compare a baseline
+// (reference) VCF against a query VCF chromosome by chromosome.
+// Apart from merge() and ToUpper(), only declarations live in this header;
+// the definitions are expected in the accompanying .cpp file.
+class WholeGenome{
+private:
+ int chrom_num;
+ int thread_num;
+ string ref_vcf_filename;
+ string que_vcf_filename;
+ int baseline_variant_total_num;
+ int query_variant_total_num;
+ vector<string> baseline_variant_strings;
+ vector<string> query_variant_strings;
+ bool detail_results;
+
+ //int thread_num; VCF->DiploidVariant->WholeGenome
+protected:
+ // chromosome name <-> internal chromosome id lookup tables
+ map<string, int> chrid_by_chrname;
+ map<int, string> chrname_by_chrid;
+ map<string, int> chrname_dict;
+ // sequence text keyed by chromosome id
+ map<int, string> genome_sequences;
+ // arrays of per-chromosome variant lists (one heap-allocated vector per chromosome)
+ vector<DiploidVariant> ** ref_variant_by_chrid;
+ vector<DiploidVariant> ** que_variant_by_chrid;
+ vector<vector<VariantIndicator>> ** variant_cluster_by_chrid;
+ // a cluster list is represented as vector<vector<VariantIndicator>>;
+ // we create an array of pointers to such lists, one per chromosome,
+ // and hold the pointer to that array
+
+ vector<vector<VariantIndicator>> variants_by_cluster;
+
+ vector<string> *** match_records_by_mode_by_thread;
+ //vector<int> *** baseline_matches_by_mode_by_thread;
+ //vector<int> *** query_matches_by_mode_by_thread;
+ vector<int> ** baseline_total_match_num;
+ vector<int> ** query_total_match_num;
+
+ map<int, float> *** quality_que_matchnum_by_thread_mode;
+
+ //map<float, int> *** tp_qual_num_by_mode_by_thread;
+ //map<float, int> *** fp_qual_num_by_mode_by_thread;
+
+ //map<float, int> query_total_qual_num;
+
+ string output_prefix;
+ string output_dir;
+ // copy the above into this.
+
+ int score_unit_indicator;
+ int match_mode_indicator;
+ int score_scheme_indicator;
+
+ vector<int> score_unit_list;
+ vector<int> match_mode_list;
+ vector<int> score_scheme_list;
+ vector<int> mode_index_list;
+
+ map<int, float> quality_que_totalnum;
+
+ bool ReadWholeGenomeSequence(string filename);
+ bool ReadGenomeSequenceList(string filename);
+ int ReadWholeGenomeVariant(string filename, bool flag);
+ bool ReadVariantFileList(string filename);
+ int ReadReferenceVariants(string filename);
+ int ReadQueryVariants(string filename);
+ bool ParallelClustering(); // parallel by chr id
+ bool ParallelMatching(); // parallel by task
+ bool TBBMatching();
+
+ void SingleThreadClustering(int chr_id);
+ //bool MatchingSingleCluster(int cluster_index, int thread_index, int match_mode);
+
+ //override
+ bool ClusteringMatchInThread(int start, int end, int thread_index);
+ void ClusteringMatchMultiThread();
+ int NormalizeVariantSequence(int pos,
+ string & parsimonious_ref,
+ string & parsimonious_alt0,
+ string & parsimonious_alt1,
+ int chr_id);
+
+ // strict-weak ordering of intervals by their start coordinate
+ struct compInterval {
+ bool operator()(const Interval &a, const Interval &b) const {
+ return a.start<b.start;
+ }
+ };
+
+ // Sorts `intervals` in place by start and coalesces every interval whose
+ // start is <= the end of the previously kept one; returns the merged list.
+ vector<Interval> merge(vector<Interval> &intervals) {
+ sort(intervals.begin(),intervals.end(),compInterval());
+ vector<Interval> results;
+ for(int i=0; i<intervals.size(); i++) {
+ if(results.empty() || results.back().end < intervals[i].start) // no overlap
+ results.push_back(intervals[i]);
+ else // overlap
+ results.back().end = max(results.back().end, intervals[i].end);
+ }
+ return results;
+ }
+
+ bool PathNeedDecision(SequencePath& sp, multimap<int, int> * choices_by_pos[], int pos);
+ int PathExtendOneStep(SequencePath& sp,
+ multimap<int, int> * choices_by_pos[],
+ const string & reference_sequence,
+ vector<int> & sync_points,
+ int match_mode);
+
+ bool PathMakeDecision(SequencePath& sp,
+ vector<DiploidVariant> & variant_list,
+ multimap<int, int> * choices_by_pos[],
+ list<SequencePath> & sequence_path_list,
+ const string & reference_sequence,
+ int score_unit,
+ int match_mode,
+ int score_scheme);
+
+ bool MatchingSingleClusterBaseExtending(int cluster_index,
+ int thread_index,
+ vector<DiploidVariant> & variant_list,
+ string & subsequence,
+ int offset,
+ multimap<int, int> * choices_by_pos[],
+ vector<int> & sync_points,
+ int chr_id,
+ int score_unit,
+ int match_mode,
+ int score_scheme);
+
+ bool DonorLengthEqual(SequencePath & a, SequencePath & b);
+ void ConvergePaths(list<SequencePath> & path_list);
+ int CheckPathEqualProperty(SequencePath & sp, int match_mode);
+
+ int ScoreEditDistance(DiploidVariant & dv, int allele_indicator);
+ int EditDistance(const std::string& s1, const std::string& s2);
+ bool PathMakeDecisionNoGenotype(SequencePath& sp,
+ vector<DiploidVariant> & variant_list,
+ multimap<int, int> * choices_by_pos[],
+ list<SequencePath> & sequence_path_list,
+ const string & reference_sequence,
+ int score_unit,
+ int match_mode,
+ int score_scheme);
+
+ void ConstructMatchRecord(SequencePath & best_path,
+ vector<DiploidVariant> & variant_list,
+ string & subsequence,
+ int offset,
+ int thread_index,
+ int chr_id,
+ int mode_index);
+
+ void ConstructMatchRecordNoGenotype(SequencePath & best_path,
+ vector<DiploidVariant> & variant_list,
+ string & subsequence,
+ int offset,
+ int thread_index,
+ int chr_id,
+ int mode_index);
+
+ int CalculateScore(DiploidVariant & dv,
+ int choice,
+ int score_unit,
+ int match_mode,
+ int score_scheme);
+
+ int GetIndexFromMatchScore(int score_unit, int match_mode, int score_scheme);
+ bool ClearQuery();
+
+ // upper-cases `s` in place
+ inline void ToUpper(string & s){
+ transform(s.begin(), s.end(), s.begin(), ::toupper);
+ }
+
+ bool CheckTandemRepeat(string sequence, int unit_threshold);
+
+ bool MatchVariantListInThread(int thread_index,
+ int chr_id,
+ vector<DiploidVariant> & variant_list,
+ int cluster_id);
+
+
+public:
+ WholeGenome(int thread_num_,
+ int score_unit_,
+ int match_mode_,
+ int score_scheme_,
+ string output_dir_);
+
+ ~WholeGenome();
+
+ void ReadRef(string genome_seq,
+ string ref_vcf);
+
+ void Compare(string query_vcf,
+ string output_prefix,
+ bool detail_results);
+
+ void DirectMatch(string ref_vcf,
+ string query_vcf);
+
+ int test(); // for direct test
+ void PrintPath(SequencePath & sp);
+
+ // NOTE(review): the meaning of these limits is not visible in this header;
+ // confirm against their uses in the .cpp before changing any of them.
+ const static int MATCH_MODE_NUM = 16;
+ const static int VAR_LEN = 100;
+ const static int MAX_REPEAT_LEN = 1000;
+ const static int ROC_SAMPLE_NUM = 5;
+};
diff --git a/src/wholegenome_working.cpp b/src/wholegenome_working.cpp
new file mode 100644
index 0000000..b796b17
--- /dev/null
+++ b/src/wholegenome_working.cpp
@@ -0,0 +1,2471 @@
+#include "wholegenome.h"
+
+using namespace std;
+
+WholeGenome::WholeGenome(int thread_num_,
+                         string output_dir_,
+                         bool pr_curves){
+    // Sets up an instance for up to 24 chromosomes (1-22, X, Y):
+    //   thread_num_ : number of worker threads to use
+    //   output_dir_ : directory used for output
+    //   pr_curves   : when true several percentage cut-offs are prepared in
+    //                 per_list, otherwise only 0.0
+    thread_num = thread_num_;
+    output_dir = output_dir_;
+    chrom_num = 24;
+
+    // one heap-allocated variant list per chromosome, for baseline and query
+    ref_variant_by_chrid = new vector<DiploidVariant>*[chrom_num];
+    que_variant_by_chrid = new vector<DiploidVariant>*[chrom_num];
+    for (int chr = 0; chr < chrom_num; chr++) {
+        ref_variant_by_chrid[chr] = new vector<DiploidVariant>;
+        que_variant_by_chrid[chr] = new vector<DiploidVariant>;
+    }
+
+    // accepted chromosome names, with and without the "chr" prefix;
+    // ids are 0-based: "1" -> 0 ... "22" -> 21, X -> 22, Y -> 23
+    for(int id = 0; id < 22; id++){
+        const string plain_name = to_string(id + 1);
+        chrname_dict[plain_name] = id;
+        chrname_dict["chr" + plain_name] = id;
+    }
+    chrname_dict["X"] = 22;
+    chrname_dict["chrX"] = 22;
+    chrname_dict["Y"] = 23;
+    chrname_dict["chrY"] = 23;
+
+    if(!pr_curves){
+        per_list = {0.0};
+    }else{
+        per_list = {0.0, 0.1, 0.2, 0.3, 0.9};
+    }
+}
+
+inline int WholeGenome::GetIndexFromMatchScore(int score_unit, int match_mode, int score_scheme){
+    // Packs the three settings into a 4-bit index:
+    //   bit 3    : lowest bit of score_unit
+    //   bit 2    : lowest bit of match_mode
+    //   bits 1-0 : lowest two bits of score_scheme
+    const int unit_bit = (score_unit & 1) << 3;
+    const int mode_bit = (match_mode & 1) << 2;
+    const int scheme_bits = score_scheme & 3;
+    return unit_bit | mode_bit | scheme_bits;
+}
+
+WholeGenome::~WholeGenome(){
+    // Releases the per-chromosome variant lists and the arrays holding them.
+    // NOTE(review): the constructor allocates 24 lists, but chrom_num is
+    // overwritten in ReadWholeGenomeSequence; confirm both counts agree here.
+    for(int chr = 0; chr < chrom_num; chr++){
+        vector<DiploidVariant> * ref_list = ref_variant_by_chrid[chr];
+        ref_list->clear();
+        delete ref_list;
+
+        vector<DiploidVariant> * que_list = que_variant_by_chrid[chr];
+        que_list->clear();
+        delete que_list;
+    }
+    delete[] ref_variant_by_chrid;
+    delete[] que_variant_by_chrid;
+}
+
+bool WholeGenome::ReadWholeGenomeSequence(string filename){
+    // Reads a FASTA file and stores every record in genome_sequences.
+    //   filename: path of the FASTA file.
+    // Each record name (text after '>' up to the first space) must be listed in
+    // chrname_dict; ids are handed out in order of first appearance and recorded
+    // in chrid_by_chrname / chrname_by_chrid. chrom_num is set to the number of
+    // records stored. Returns false if the file cannot be opened or an unknown
+    // chromosome name is met.
+    std::ifstream input(filename);
+    if(!input.good()){
+        std::cerr << "Error opening '"<<filename<<"'. Bailing out." << std::endl;
+        return false;
+    }
+
+    std::string line, name, content;
+    int real_chrom_num = 0;
+    int chr_id = 0;
+
+    // Stores the record collected so far (name + content).
+    // Returns false when the name is not a known chromosome.
+    auto store_record = [&]() -> bool {
+        if(chrname_dict.find(name) == chrname_dict.end()){
+            cout << "[VarMatch] Error: detected chromosome name: " << name <<" does not exist in human genome." << endl;
+            return false;
+        }
+        if(chrid_by_chrname.find(name) == chrid_by_chrname.end()){
+            chrid_by_chrname[name] = chr_id;
+            chr_id++;
+        }
+        const int current_id = chrid_by_chrname[name];
+        chrname_by_chrid[current_id] = name;
+        genome_sequences[current_id] = content;
+        real_chrom_num++;
+        return true;
+    };
+
+    // Test the stream itself rather than .good(): when the final line has no
+    // trailing newline getline() still delivers it but sets eofbit, and a
+    // .good() test would silently drop that last sequence line.
+    while( std::getline( input, line ) ){
+        if( line.empty() || line[0] == '>' ){ // Identifier marker
+            if( !name.empty() ){ // flush the previous record
+                if(!store_record()) return false;
+                name.clear();
+            }
+            if( !line.empty() ){
+                name = split(line, ' ')[0].substr(1);
+            }
+            content.clear();
+        } else if( !name.empty() ){
+            if( line.find(' ') != std::string::npos ){ // Invalid sequence--no spaces allowed
+                name.clear();
+                content.clear();
+            } else {
+                content += line;
+            }
+        }
+    }
+    if( !name.empty() ){ // flush the last record
+        if(!store_record()) return false;
+    }
+
+    chrom_num = real_chrom_num;
+    return true;
+}
+
+bool WholeGenome::ReadGenomeSequenceList(string filename){
+    // Placeholder: reading a list of genome sequence files is not implemented.
+    // Return false explicitly, since flowing off the end of a function that
+    // returns bool is undefined behavior.
+    (void)filename;
+    return false;
+}
+
+int WholeGenome::ReadWholeGenomeVariant(string filename, bool flag){
+    // Parses a VCF file into the per-chromosome DiploidVariant lists.
+    //   filename: path of the VCF file.
+    //   flag    : false -> variants go to ref_variant_by_chrid (baseline);
+    //             true  -> variants go to que_variant_by_chrid, the raw line is
+    //                      kept in query_variant_strings and the QUAL values
+    //                      are used to fill threshold_list / per_list.
+    // Genotype matching is switched off (match_mode_indicator = 1) as soon as
+    // a line lacks usable genotype information.
+    // Returns the number of variants stored, or -1 if the file cannot be opened.
+    int total_num = 0;
+    int long_num = 0; // variants skipped because an indel is longer than VAR_LEN
+    double QUAL_LOWER_BOUND = 0.1;
+
+    ifstream vcf_file;
+    vcf_file.open(filename.c_str());
+    if (!vcf_file.good()) {
+        cout << "[VarMatch] Error: can not open vcf file" << endl;
+        return -1;
+    }
+
+    vector<float> quality_list;
+
+    int genotype_index = -1;
+    char genotype_separator = '/';
+    string line;
+    while (getline(vcf_file, line, '\n')) {
+        // skip empty lines and header lines
+        if ((int)line.length() <= 1) continue;
+        if (line[0] == '#') continue;
+
+        auto columns = split(line, '\t');
+        if (columns.size() < 10) {
+            if(match_mode_indicator != 1){
+                cout << "[VarMatch] Warning: not enough information in VCF file for genotype matching." << endl;
+                cout << "[VarMatch] \tAutomatically turn off genotype matching module " << filename << endl;
+                match_mode_indicator = 1;
+            }
+            if(columns.size() < 6){
+                cout << "[VarMatch] Warning: not enough information in VCF file for variant matching." << endl;
+                cout << "[VarMatch] skip current variant: " << line << endl;
+                continue;
+            }
+        }
+        string chr_name = columns[0];
+        auto pos = atoi(columns[1].c_str()) - 1; // 0-based coordinate
+
+        auto ref = columns[3];
+        auto alt_line = columns[4];
+
+        // QUAL may be "." (missing) or otherwise non-numeric; std::stod would
+        // throw on such input, so parse leniently and fall back to 0.
+        char * qual_end = nullptr;
+        double quality = strtod(columns[5].c_str(), &qual_end);
+        if (qual_end == columns[5].c_str()) quality = 0.0;
+
+        if(flag){
+            quality_list.push_back(quality);
+        }
+
+        ToUpper(ref);
+        ToUpper(alt_line);
+
+        bool is_heterozygous_variant = false;
+        bool is_multi_alternatives = false;
+
+        if (match_mode_indicator != 1) { // genotype matching still enabled
+            if (genotype_index < 0) {
+                // locate the GT entry of the FORMAT column once
+                auto formats = split(columns[8], ':');
+                for (int i = 0; i < formats.size(); i++) {
+                    if (formats[i] == "GT") {
+                        genotype_index = i;
+                        break;
+                    }
+                }
+                if(genotype_index < 0){
+                    cout << "[VarMatch] Warning: VCF entry does not contain genotype information." << endl;
+                    cout << "[VarMatch] \tAutomatically turn off genotype matching mode. " << endl;
+                    match_mode_indicator = 1;
+                }
+            }
+
+            if(match_mode_indicator != 1){
+                auto additionals = split(columns[9], ':');
+                if(genotype_index >= (int)additionals.size()){
+                    // sample column is shorter than FORMAT: nothing to index
+                    cout << "[VarMatch] Warning: sample column does not contain a genotype field." << endl;
+                    cout << "[VarMatch] skip current variant: " << line << endl;
+                    continue;
+                }
+                vector<string> genotype_columns = split(additionals[genotype_index], genotype_separator);
+
+                if(genotype_columns.size() != 2){
+                    // retry with the other separator ('/' unphased, '|' phased)
+                    if(genotype_separator == '/'){
+                        genotype_separator = '|';
+                    }else{
+                        genotype_separator = '/';
+                    }
+                    genotype_columns = split(additionals[genotype_index], genotype_separator);
+                }
+
+                if (genotype_columns.size() != 2) {
+                    cout << "[VarMatch] Warning: Unrecognized Genotype: " << additionals[genotype_index] << endl;
+                    cout << "[VarMatch] \tAutomatically turn off genotype matching mode." << endl;
+                    match_mode_indicator = 1;
+                }
+                else {
+                    if (genotype_columns[0] != genotype_columns[1]) {
+                        is_heterozygous_variant = true;
+                    }
+                    if (genotype_columns[1] == "0" && genotype_columns[0] == "0") {
+                        // both alleles are the reference allele: nothing to match
+                        continue;
+                    }
+                }
+            }
+        }
+
+        vector<string> alt_list;
+        if (alt_line.find(",") != std::string::npos) {
+            alt_list = split(alt_line, ',');
+            // only treat as multi-allelic when a second allele really exists
+            is_multi_alternatives = alt_list.size() > 1;
+        }
+        else {
+            alt_list.push_back(alt_line);
+        }
+        if (alt_list.empty()) {
+            cout << "[VarMatch] Warning: not enough information in VCF file for variant matching." << endl;
+            cout << "[VarMatch] skip current variant: " << line << endl;
+            continue;
+        }
+
+        // longest insertion / deletion over the (up to two) alternative alleles
+        int snp_ins = max(0, (int)alt_list[0].length() - (int)ref.length());
+        int snp_del = max(0, (int)ref.length() - (int)alt_list[0].length());
+        if(is_multi_alternatives){
+            snp_ins = max(snp_ins, (int)alt_list[1].length() - (int)ref.length());
+            snp_del = max(snp_del, (int)ref.length() - (int)alt_list[1].length());
+        }
+
+        if(snp_ins > VAR_LEN || snp_del > VAR_LEN){
+            long_num ++;
+            continue;
+        }
+
+        DiploidVariant dv(pos, ref, alt_list, is_heterozygous_variant, is_multi_alternatives, snp_del, snp_ins, flag, quality);
+
+        auto chr_it = chrid_by_chrname.find(chr_name);
+        if(chr_it == chrid_by_chrname.end()){
+            cout << "[VarMatch] skip current variant as no corresponding reference genome sequence found." << endl;
+            continue;
+        }
+        int chr_id = chr_it->second;
+        if(flag == false){
+            ref_variant_by_chrid[chr_id]->push_back(dv);
+        }else{
+            que_variant_by_chrid[chr_id]->push_back(dv);
+            query_variant_strings.push_back(line);
+        }
+
+        total_num++;
+    }
+    vcf_file.close();
+
+    if(flag){
+        sort(quality_list.begin(), quality_list.end());
+        auto qual_lower_it = lower_bound(quality_list.begin(), quality_list.end(), QUAL_LOWER_BOUND);
+        int qual_lower_index = qual_lower_it - quality_list.begin();
+        int rest_size = quality_list.size() - qual_lower_index;
+
+        vector<float> temp_percentage_list;
+        temp_percentage_list.push_back(0.0);
+        threshold_list.push_back(0.0);
+
+        for(int i = 1; i < per_list.size(); i++){
+            if(quality_list.empty()){
+                // no QUAL values at all: keep the lists the same length
+                // without indexing into an empty vector
+                threshold_list.push_back(0.0);
+                temp_percentage_list.push_back(0.0);
+                continue;
+            }
+            int additional_index = (int)(rest_size * per_list[i]);
+            int real_index = qual_lower_index + additional_index;
+            if(real_index >= (int)quality_list.size()) real_index = (int)quality_list.size() - 1;
+            double threshold_quality = quality_list[real_index];
+            threshold_list.push_back(threshold_quality);
+
+            auto quality_lowit = lower_bound(quality_list.begin(), quality_list.end(), threshold_quality);
+            int quality_low_index = quality_lowit - quality_list.begin();
+            // following program will retain variants >= quality threshold
+
+            int quality_size = quality_low_index + 1; // counting number, +/- 1 does not matter
+            if(quality_size > (int)quality_list.size()) quality_size = (int)quality_list.size();
+            double percentage = (double)quality_size/ quality_list.size();
+            temp_percentage_list.push_back(percentage);
+        }
+        threshold_num = threshold_list.size();
+        // revise percentages to the values actually realised
+        per_list = temp_percentage_list;
+    }
+    cout << flag << "," << total_num << "," << long_num << endl;
+    return total_num;
+}
+
+bool WholeGenome::ReadVariantFileList(string filename){
+    // Placeholder: reading a list of variant files is not implemented.
+    // Return false explicitly, since flowing off the end of a function that
+    // returns bool is undefined behavior.
+    (void)filename;
+    return false;
+}
+
+int WholeGenome::ScoreEditDistance(DiploidVariant & dv, int allele_indicator){
+    // Edit distance between the variant's reference allele and the
+    // alternative allele selected by allele_indicator.
+    const int distance = EditDistance(dv.ref, dv.alts[allele_indicator]);
+    return distance;
+}
+
+inline int WholeGenome::EditDistance(const std::string& s1, const std::string& s2)
+{
+    // Levenshtein distance between s1 and s2 (unit cost for insertion,
+    // deletion and substitution), computed with two rolling rows.
+    const std::size_t rows = s1.size();
+    const std::size_t cols = s2.size();
+    std::vector<unsigned int> current(cols + 1), previous(cols + 1);
+
+    // distance from the empty prefix of s1 to every prefix of s2
+    for (unsigned int c = 0; c < previous.size(); c++)
+        previous[c] = c;
+
+    for (unsigned int r = 0; r < rows; r++) {
+        current[0] = r + 1;
+        for (unsigned int c = 0; c < cols; c++) {
+            const unsigned int deletion = previous[c + 1] + 1;
+            const unsigned int insertion = current[c] + 1;
+            const unsigned int substitution = previous[c] + (s1[r] == s2[c] ? 0 : 1);
+            current[c + 1] = std::min(std::min(deletion, insertion), substitution);
+        }
+        current.swap(previous);
+    }
+    // after the final swap the last computed row lives in `previous`
+    return previous[cols];
+}
+
+// Needleman Wunsch Initialization
+inline void WholeGenome::initialize_score_matrix(int **score, char **trackBack, int M, int N)
+{
+    // Fills the first row and first column of the (N+1) x (M+1) alignment
+    // matrices: each leading gap costs -1; '-' marks a move along the row,
+    // '|' a move down the column and '*' the origin cell.
+    for (int col = 0; col <= M; col++)
+    {
+        score[0][col] = -col;
+        trackBack[0][col] = '-';
+    }
+
+    for (int row = 0; row <= N; row++)
+    {
+        score[row][0] = -row;
+        trackBack[row][0] = '|';
+    }
+
+    trackBack[0][0] = '*';
+}
+
+int WholeGenome::needleman_wunsch(string S1, string S2, string &R1, string &R2)
+{
+    // Global alignment of S1 and S2 with match = 0 and mismatch = gap = -1.
+    //   R1, R2: receive the aligned strings ('-' marks a gap); overwritten.
+    // Returns the alignment score (<= 0).
+    // On ties a gap move is preferred over the diagonal move.
+    int M = S1.length();
+    int N = S2.length();
+    // The matrices have N+1 rows (S2) and M+1 columns (S1).
+    int **score = new int *[N+1];
+    for (int i = 0; i <= N; i++)
+    {
+        score[i] = new int [M+1];
+    }
+
+    char **trackBack = new char *[N+1];
+    // * for match/mismatch (diagonal), - for moving right, | for moving downward
+    for (int i = 0; i <= N; i++)
+    {
+        trackBack[i] = new char [M+1];
+    }
+    R1 = "";
+    R2 = "";
+    initialize_score_matrix(score, trackBack, M, N);
+
+    for (int i = 1; i <= N; i++)
+    {
+        for (int k = 1; k <= M; k++)
+        {
+            char S1_k = S1[k-1];
+            char S2_i = S2[i-1];
+            int matchingCost = score[i-1][k-1];
+            if(S1_k != S2_i) matchingCost--;
+            int rightCost = score[i][k-1] - 1;
+            int downCost = score[i-1][k] - 1;
+            if (matchingCost > rightCost && matchingCost > downCost)
+            {
+                score[i][k] = matchingCost;
+                trackBack[i][k] = '*';
+            }else if(rightCost >= downCost)
+            {
+                score[i][k] = rightCost;
+                trackBack[i][k] = '-';
+            }else
+            {
+                score[i][k] = downCost;
+                trackBack[i][k] = '|';
+            }
+        }
+    }
+
+    // trace back from the bottom-right corner; strings are built reversed
+    int n = N;
+    int m = M;
+    while(n > 0 || m > 0)
+    {
+        if (trackBack[n][m] == '*')
+        {
+            R1 += S1[m-1];
+            R2 += S2[n-1];
+            n--;
+            m--;
+        }else if(trackBack[n][m] == '-')
+        {
+            R1 += S1[m-1];
+            R2 += '-';
+            m--;
+        }else if(trackBack[n][m] == '|')
+        {
+            R1 += '-';
+            R2 += S2[n-1];
+            n--;
+        }
+    }
+    reverse(R1.begin(), R1.end());
+    reverse(R2.begin(), R2.end());
+
+    // release both matrices; they were previously leaked on every call
+    const int final_score = score[N][M];
+    for (int i = 0; i <= N; i++)
+    {
+        delete[] score[i];
+        delete[] trackBack[i];
+    }
+    delete[] score;
+    delete[] trackBack;
+
+    return final_score;
+}
+
+void WholeGenome::GenerateAltVector(string ref, string alt, vector<string> & alt_vector){
+    // Distributes the characters of `alt` over the positions of `ref` using a
+    // global alignment: appends ref.size() strings to alt_vector, where entry
+    // i collects the alt characters aligned to (or inserted after) ref base i.
+    // Insertions ahead of the first ref base are attached to entry 0; deleted
+    // ref bases leave their entry empty. Does nothing when ref is empty.
+    if(ref.size() == 0) return;
+    string ref_match = "";
+    string alt_match = "";
+    needleman_wunsch(ref, alt, ref_match, alt_match);
+
+    const int last_index = (int)ref.size() - 1;
+    for(int i = 0; i <= last_index; i++){
+        alt_vector.push_back("");
+    }
+
+    int current_ref_index = -1; // index of the most recently consumed ref base
+    for(int i = 0; i < ref_match.size(); i++){
+        if(ref_match[i] == '-'){
+            // insertion relative to ref
+            int slot = current_ref_index < 0 ? 0 : min(current_ref_index, last_index);
+            alt_vector[slot].push_back(alt_match[i]);
+        }else if(alt_match[i] == '-'){
+            // deletion: the ref base is consumed, nothing is emitted
+            current_ref_index ++;
+        }else{
+            current_ref_index ++;
+            // clamp instead of writing past the end; the former check pushed to
+            // the last slot and then still indexed out of range
+            alt_vector[min(current_ref_index, last_index)].push_back(alt_match[i]);
+        }
+    }
+    return;
+}
+
+bool WholeGenome::ParallelClustering(){
+    // Clusters variants chromosome by chromosome, running up to thread_num
+    // chromosomes concurrently (thread_num-1 worker threads plus the calling
+    // thread), then concatenates all per-chromosome clusters into
+    // variants_by_cluster. Always returns true.
+    variant_cluster_by_chrid = new vector<vector<VariantIndicator>> *[chrom_num];
+    for (int j = 0; j < chrom_num; j++) {
+        variant_cluster_by_chrid[j] = new vector<vector<VariantIndicator>>;
+    }
+
+    // number of rounds needed: ceil(chrom_num / thread_num)
+    int parallel_steps = chrom_num / thread_num;
+    if(parallel_steps*thread_num < chrom_num) parallel_steps += 1;
+    int chr_id = 0;
+    for(int i = 0; i < parallel_steps; i++){
+        vector<thread> threads;
+        for(int j = 0; j < thread_num-1 && chr_id < chrom_num-1; j++){
+            if(chrname_by_chrid.find(chr_id) != chrname_by_chrid.end()){
+                // only chromosomes with variants on both sides need clustering
+                if(ref_variant_by_chrid[chr_id]->size() > 0 && que_variant_by_chrid[chr_id]->size() > 0){
+                    threads.push_back(thread(&WholeGenome::SingleThreadClustering, this, chr_id));
+                }
+            }
+            chr_id ++;
+        }
+        // the calling thread takes one chromosome itself in every round
+        // NOTE(review): unlike the worker path, this call is not guarded by
+        // the "both lists non-empty" check; confirm that is intended.
+        if(chr_id < chrom_num){
+            if(chrname_by_chrid.find(chr_id) != chrname_by_chrid.end()){
+                SingleThreadClustering(chr_id);
+            }
+            chr_id ++;
+        }
+        std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
+        threads.clear();
+    }
+
+    // flatten the per-chromosome clusters into one list
+    for(int i = 0; i < chrom_num; i++){
+        if(variant_cluster_by_chrid[i]->size() > 0){
+            variants_by_cluster.insert(variants_by_cluster.end(), variant_cluster_by_chrid[i]->begin(), variant_cluster_by_chrid[i]->end());
+        }
+    }
+
+    // the per-chromosome containers are only needed inside this function
+    for(int j = 0; j < chrom_num; j++){
+        variant_cluster_by_chrid[j]->clear();
+        delete variant_cluster_by_chrid[j];
+    }
+    delete[] variant_cluster_by_chrid;
+
+    return true;
+}
+
+bool WholeGenome::ParallelMatching(){
+    // Placeholder: task-parallel matching is not implemented.
+    // Return false explicitly, since flowing off the end of a function that
+    // returns bool is undefined behavior.
+    return false;
+}
+
+bool WholeGenome::TBBMatching()
+{
+    // Placeholder: TBB-based matching is not implemented.
+    // Return false explicitly, since flowing off the end of a function that
+    // returns bool is undefined behavior.
+    return false;
+}
+
+
+bool WholeGenome::CheckTandemRepeat(string sequence, int unit_threshold) {
+    // Returns true when `sequence` (compared case-insensitively) starts with a
+    // unit that is repeated back to back at least twice, ignoring a trailing
+    // partial copy. A single-character sequence is reported as a repeat.
+    // unit_threshold currently has no influence on the result.
+    const int total_length = (int)sequence.length();
+    if(total_length == 1) return true;
+    transform(sequence.begin(), sequence.end(), sequence.begin(), ::toupper);
+
+    const int longest_unit = total_length / 2 + 1;
+    for (int unit = 1; unit <= longest_unit; unit++) {
+        int copies = 1;
+        bool every_copy_equal = true;
+        // walk over the complete unit-sized chunks that follow the first one
+        for (int chunk_start = unit; chunk_start + unit <= total_length; chunk_start += unit) {
+            if (sequence.compare(chunk_start, unit, sequence, 0, unit) != 0) {
+                every_copy_equal = false;
+                break;
+            }
+            copies ++;
+        }
+        if (every_copy_equal && copies > 1) {
+            return true;
+        }
+    }
+    return false;
+}
+
+bool WholeGenome::MatchVariantListInThread(int thread_index,
+                                           int threshold_index,
+                                           int chr_id,
+                                           vector<DiploidVariant> & variant_list,
+                                           int cluster_id){
+    // Matches the baseline and query variants of one cluster.
+    //   thread_index    : selects the per-thread result containers
+    //   threshold_index : selects the per-threshold match counters
+    //   chr_id          : chromosome the cluster lies on
+    //   variant_list    : variants of the cluster (sorted in place)
+    //   cluster_id      : forwarded to MatchingSingleClusterBaseExtending
+    // Returns false when the cluster cannot match (one side empty, or a single
+    // variant too large to be explained by the other side), true otherwise.
+    sort(variant_list.begin(), variant_list.end());
+
+    // split into baseline [0] and query [1] and find the covered region
+    vector<DiploidVariant> separate_var_list[2];
+    vector<Interval> intervals;
+    int total_mil = 0;
+    int total_mdl = 0;
+    int min_pos = genome_sequences[chr_id].length() + 1;
+    int max_pos = -1;
+    for (int i = 0; i < variant_list.size(); i++) {
+        int flag = 0;
+        if (variant_list[i].flag) flag = 1; // flag indicate if the variant is from ref set(0) or query set(1)
+        int pos = variant_list[i].pos;
+        separate_var_list[flag].push_back(variant_list[i]);
+        total_mil += variant_list[i].mil;
+        total_mdl += variant_list[i].mdl;
+        int ref_length = (int)variant_list[i].ref.length();
+        min_pos = min(pos, min_pos);
+        max_pos = max(pos + ref_length, max_pos);
+
+        int end_pos = pos + ref_length - 1; // included end position!!
+        intervals.push_back(Interval(pos, end_pos));
+    }
+    // pad the region by one base on each side, clamped to the chromosome
+    min_pos = max(min_pos - 1, 0);
+    max_pos = min(max_pos + 1, (int)genome_sequences[chr_id].length()); //exclusive
+
+    if (separate_var_list[0].size() == 0 || separate_var_list[1].size() == 0) {
+        return false;
+    }
+    if (separate_var_list[0].size() == 1 && separate_var_list[1].size() == 1){
+        // try direct match to save time
+        if(separate_var_list[0][0] == separate_var_list[1][0]){
+
+            DiploidVariant tv = separate_var_list[0][0];
+            string match_record = chrname_by_chrid[chr_id] + "\t" + to_string(tv.pos+1) + "\t" + tv.ref + "\t" + tv.alts[0];
+            if(tv.multi_alts) match_record += "/" + tv.alts[1];
+            match_record += "\t.\t.\t.\t.\t.\n";
+            // the identical pair counts as a match under every mode
+            for(int mi = 0; mi < mode_index_list.size(); mi ++){
+                int mode_i = mode_index_list[mi];
+                match_records_by_mode_by_thread[thread_index][mode_i]->push_back(match_record);
+                baseline_total_match_num[thread_index][threshold_index]->at(mode_i)++;
+                query_total_match_num[thread_index][threshold_index]->at(mode_i)++;
+            }
+            return true;
+        }
+        // if not match, still can match by changing genome
+    }else if(separate_var_list[0].size() == 1 || separate_var_list[1].size() == 1){
+        // one variant against more than four: give up early when the single
+        // variant changes more bases than all the others together
+        int flag = 0;
+        if(separate_var_list[1].size() == 1) flag = 1;
+        int r_flag = 1-flag;
+        if(separate_var_list[r_flag].size() > 4){
+            int total_r_mdl = 0;
+            int total_r_mil = 0;
+
+            for(int k = 0; k < separate_var_list[r_flag].size(); k++){
+                total_r_mdl += separate_var_list[r_flag][k].mdl;
+                total_r_mil += separate_var_list[r_flag][k].mil;
+            }
+
+            if(max(separate_var_list[flag][0].mdl, separate_var_list[flag][0].mil) > max(total_r_mdl, total_r_mil)) return false;
+        }
+    }
+
+    // remove singular variant: one whose length change exceeds the combined
+    // change of all the others
+    // [todo] try removing this filter to see running time changes
+    vector<bool> appliable_flag;
+    int total_change = total_mil+total_mdl;
+
+    for(int k = 0; k < variant_list.size(); k++){
+        int max_change = max(variant_list[k].mil, variant_list[k].mdl);
+        appliable_flag.push_back(!(max_change > total_change-max_change));
+    }
+
+    string subsequence = genome_sequences[chr_id].substr(min_pos, max_pos - min_pos);
+
+    ToUpper(subsequence); // subsequence only contains upper char
+    int offset = min_pos;
+
+    // decision points: position (relative to offset) -> index into variant_list,
+    // kept separately for baseline [0] and query [1]
+    multimap<int, int> * choices_by_pos[2];
+    for(int i = 0; i < 2; i++){
+        choices_by_pos[i] = new multimap<int, int>();
+    }
+
+    for(int index = 0; index < variant_list.size(); index++){
+        if(!appliable_flag[index]) continue;
+        int pos = variant_list[index].pos - offset;
+        int flag = 0;
+        if(variant_list[index].flag) flag = 1;
+        choices_by_pos[flag]->insert(pair<int, int>(pos, index));
+    }
+
+    // sync points: last position of every merged variant interval
+    vector<Interval> merged_intervals = merge(intervals);
+    vector<int> sync_points;
+    for(int i = 0; i < merged_intervals.size(); i++){
+        sync_points.push_back(merged_intervals[i].end-offset);
+    }
+
+    if(sync_points.back() < subsequence.size() - 1){
+        sync_points.push_back(subsequence.size()-1);
+    }
+
+    // run the matcher for every configured parameter combination
+    for(int i = 0; i < score_unit_list.size(); i++){
+        int score_unit = score_unit_list[i];
+        for(int j = 0; j < match_mode_list.size(); j++){
+            int match_mode = match_mode_list[j];
+            for(int k = 0; k < score_scheme_list.size(); k++){
+                int score_scheme = score_scheme_list[k];
+
+                MatchingSingleClusterBaseExtending(
+                    cluster_id,
+                    thread_index,
+                    variant_list,
+                    subsequence,
+                    offset,
+                    choices_by_pos,
+                    sync_points,
+                    chr_id,
+                    score_unit,
+                    match_mode,
+                    score_scheme,
+                    threshold_index);
+            }
+        }
+    }
+
+    // release the decision-point maps; they were previously leaked on every call
+    for(int i = 0; i < 2; i++){
+        delete choices_by_pos[i];
+    }
+    return true;
+}
+
+bool WholeGenome::ClusteringMatchInThread(int start, int end, int thread_index) {
+    // Processes clusters [start, end) of variants_by_cluster on behalf of one
+    // thread. For every quality threshold the variants whose qual is below the
+    // threshold are dropped and the remainder is handed to
+    // MatchVariantListInThread. Always returns true.
+    for (int cluster_id = start; cluster_id < end; cluster_id++) {
+        if(cluster_id >= variants_by_cluster.size()) break;
+
+        const vector<VariantIndicator> & members = variants_by_cluster[cluster_id];
+        if(members.size() <= 1) continue; // a lone variant has nothing to match
+
+        for(int t = 0; t < threshold_num; t++){
+            const double quality_threshold = threshold_list[t];
+
+            // materialise the variants referenced by this cluster
+            vector<DiploidVariant> variant_list;
+            int chr_id = -1;
+            for(int m = 0; m < members.size(); m++){
+                const VariantIndicator & vi = members[m];
+                chr_id = vi.chr_id;
+                vector<DiploidVariant> * source_list =
+                    vi.refer ? ref_variant_by_chrid[chr_id] : que_variant_by_chrid[chr_id];
+                DiploidVariant var = source_list->at(vi.var_id);
+                if(var.qual < quality_threshold) continue;
+                variant_list.push_back(var);
+            }
+            if(chr_id == -1 || chr_id >= chrom_num){
+                cout << "[VarMatch] Error in matching single cluster" << endl;
+                continue;
+            }
+
+            MatchVariantListInThread(thread_index,
+                                     t,
+                                     chr_id,
+                                     variant_list,
+                                     cluster_id);
+        }
+    }
+    return true;
+}
+
+
+// to reduce memory usage of paths, move all functions about SequencePath out into WholeGenome with a parameter SequencePath
+bool WholeGenome::PathNeedDecision(SequencePath& sp, multimap<int, int> * choices_by_pos[], int pos){
+    // Returns true when, for either of the two choice maps, a choice exists at
+    // `pos` that the path has not recorded in choice_made yet.
+    for(int side = 0; side < 2; side++){
+        // nothing to decide on this side at this position
+        if(choices_by_pos[side]->count(pos) == 0) continue;
+        // a choice exists but the path has not taken one yet
+        if(sp.choice_made[side].count(pos) == 0) return true;
+    }
+    return false;
+}
+
+// Check that the donor sequences of the first side (indices 0 and 1) are still compatible
+// with those of the second side (indices 2 and 3).
+// With match_mode == 0 both strands are compared (0 vs 2 and 1 vs 3); with any other mode
+// only strand 0 is compared (0 vs 2).
+//  - all compared pairs have equal length: they must be identical; then same_donor_len is
+//    set and current_equal_donor_pos marks the last character of each compared strand.
+//  - otherwise same_donor_len is cleared and only the not-yet-verified part of the common
+//    prefix (from current_equal_donor_pos+1) is compared.
+// Returns 0 when the path is still consistent, -1 when it must be dropped.
+int WholeGenome::CheckPathEqualProperty(SequencePath & sp, int match_mode)
+{
+    const int strand_num = (match_mode == 0) ? 2 : 1;
+
+    bool lengths_agree = true;
+    for(int s = 0; s < strand_num; s++){
+        if(sp.donor_sequences[s].length() != sp.donor_sequences[2+s].length()){
+            lengths_agree = false;
+            break;
+        }
+    }
+
+    if(lengths_agree){
+        // same length on every compared strand: the sequences have to be the same
+        for(int s = 0; s < strand_num; s++){
+            if(sp.donor_sequences[s] != sp.donor_sequences[2+s]) return -1;
+        }
+        sp.same_donor_len = true;
+        for(int s = 0; s < strand_num; s++){
+            sp.current_equal_donor_pos[s] = sp.donor_sequences[s].length()-1;
+        }
+        return 0;
+    }
+
+    sp.same_donor_len = false;
+    for(int s = 0; s < strand_num; s++){
+        // compare each strand over the prefix both donors already cover
+        int shared_len = min(sp.donor_sequences[s].length(), sp.donor_sequences[2+s].length());
+        for(int k = sp.current_equal_donor_pos[s]+1; k < shared_len; k++){
+            if(sp.donor_sequences[s][k] != sp.donor_sequences[2+s][k]){
+                return -1;
+            }
+        }
+        sp.current_equal_donor_pos[s] = shared_len-1;
+    }
+    return 0;
+}
+
+// Extend `sp` by one step. A step is not a single nucleotide: it runs from the position after
+// sp.current_genome_pos up to and including the next sync point (one step, one sync point).
+// Return value:
+//   -1  the path is dropped (no sync point left, or the donors are no longer compatible)
+//    0  the next sync point was reached and the donors are still compatible
+//    1  extension stopped in front of a position that first needs a decision
+//    2  the last sync point was reached and the donor sequences are equal
+int WholeGenome::PathExtendOneStep(SequencePath& sp,
+                                   multimap<int, int> * choices_by_pos[],
+                                   const string & reference_sequence,
+                                   vector<int> & sync_points,
+                                   int match_mode){
+    if(sp.reached_sync_num >= sync_points.size()) return -1;
+
+    // the next sync point; this position is included in the step
+    const int target_pos = sync_points[sp.reached_sync_num];
+    // in match mode 1 only the even strands (0 and 2) are extended
+    const int strand_stride = (match_mode == 1) ? 2 : 1;
+
+    int genome_pos = sp.current_genome_pos + 1;
+    while(genome_pos <= target_pos){
+        if(PathNeedDecision(sp, choices_by_pos, genome_pos)){
+            // before the decision is made, the equal property must still hold
+            return (CheckPathEqualProperty(sp, match_mode) == -1) ? -1 : 1;
+        }
+
+        // no decision pending: append one position to every maintained donor
+        for(int strand = 0; strand < 4; strand += strand_stride){
+            const auto & piece = sp.string_sequences[strand][genome_pos];
+            if(piece == "."){
+                sp.donor_sequences[strand] += reference_sequence[genome_pos];
+            }else{
+                sp.donor_sequences[strand] += piece;
+            }
+        }
+        sp.current_genome_pos = genome_pos;
+        ++genome_pos;
+    }
+
+    // the sync point has been reached
+    sp.reached_sync_num ++;
+
+    if(sp.reached_sync_num < sync_points.size()){
+        return CheckPathEqualProperty(sp, match_mode);
+    }
+
+    // the last sync point is the end of the reference sequence: donors must match completely
+    const bool donors_equal = sp.donor_sequences[0] == sp.donor_sequences[2] &&
+                              sp.donor_sequences[1] == sp.donor_sequences[3];
+    return donors_equal ? 2 : -1;
+}
+
+// Score contributed to a path by applying `choice` of variant `dv`.
+//   score_unit   0: every variant counts 1
+//                1: edit distance obtained from ScoreEditDistance
+//                other values: 0
+//   match_mode   0: allele 0 is always scored; allele 1 is added when choice == 0 and the
+//                   variant has multiple alts, or when choice is neither -1 nor 0
+//                other values: only ScoreEditDistance(dv, choice) is used
+//   score_scheme 0: return the score for every variant
+//                1: return the score only when dv.flag is false, otherwise 0
+//                2: return the score only when dv.flag is true, otherwise 0
+//                other values: 0 (the function previously fell off its end here, which is
+//                undefined behaviour for a non-void function)
+int WholeGenome::CalculateScore(DiploidVariant & dv,
+                                int choice,
+                                int score_unit,
+                                int match_mode,
+                                int score_scheme){
+    int score = 0;
+    if(score_unit == 0){
+        score = 1;
+    }else if(score_unit == 1){
+        if(match_mode == 0){
+            score += ScoreEditDistance(dv, 0);
+            bool add_second_allele;
+            if(choice == -1){
+                add_second_allele = false;
+            }else if(choice == 0){
+                add_second_allele = dv.multi_alts;
+            }else{
+                add_second_allele = true;
+            }
+            if(add_second_allele){
+                score += ScoreEditDistance(dv, 1);
+            }
+        }else{
+            score += ScoreEditDistance(dv, choice);
+        }
+    }
+
+    switch(score_scheme){
+        case 0:
+            return score;
+        case 1:
+            return dv.flag ? 0 : score;
+        case 2:
+            return dv.flag ? score : 0;
+        default:
+            // unknown scheme: contribute nothing instead of returning an indeterminate value
+            return 0;
+    }
+}
+
+// Branch a path at its next genome position when genotypes are ignored: only one strand per side is decided, extended and compared.
+// For simplicity this still works on the ordinary SequencePath structure, but only strands 0 and 2 of string_sequences are read and written.
+// For each side i the candidates at pos are (-1,-1) = use no variant, plus (var_index, allele) for every allele of every variant
+// listed in choices_by_pos[i] at pos that does not collide with a decision already written into the path.
+// One copy of sp per combination of the two sides is scored, edited and appended to sequence_path_list; always returns true.
+bool WholeGenome::PathMakeDecisionNoGenotype(SequencePath& sp, // path to branch; it is copied, never modified here
+ vector<DiploidVariant> & variant_list, // variants addressed by the indices stored in choices_by_pos
+ multimap<int, int> * choices_by_pos[], // per side: position -> variant index
+ list<SequencePath> & sequence_path_list, // output: every branched path is appended here
+ const string & reference_sequence, // not referenced in this function
+ int score_unit, // forwarded to CalculateScore
+ int match_mode, // forwarded to CalculateScore
+ int score_scheme) // forwarded to CalculateScore
+{
+ int pos = sp.current_genome_pos+1;
+ vector<pair<int, int>> candidate_choices[2];
+ for(int i = 0; i < 2; i++){
+ // (-1,-1) changes nothing when applied, so it is always a legal candidate
+ candidate_choices[i].push_back(pair<int, int>(-1, -1));
+ // it keeps the unchanged path alive:
+ // at this position, choose not to use any variant, whether or not a variant exists here
+ // var_range below spans all variants of side i that start at pos
+ pair<multimap<int, int>::iterator, multimap<int, int>::iterator> var_range;
+ var_range = choices_by_pos[i]->equal_range(pos);
+
+ for(auto it = var_range.first; it != var_range.second; ++it){
+ int var_index = (*it).second;
+ DiploidVariant var = variant_list[var_index];
+ // check whether the current variant can still be applied
+ string ref = var.ref; //even we do not know the offset, we know ref start from pos of reference_sequence
+ string alts[2];
+ alts[0] = var.alts[0];
+ alts[1] = alts[0];
+ if(var.multi_alts){
+ alts[1] = var.alts[1];
+ }
+ // first candidate: allele 0 on strand i*2
+ // what is tested is not merely whether the variant can be applied, but whether this particular choice can
+ bool choice_applicable = true;
+ for(int k = 0; k < ref.length(); k++){
+ // for each ref char
+ int y = 0;
+ // only the first strand of side i is inspected
+ if(sp.string_sequences[i*2+y][k+pos] != "."){
+ // a decision covering this position has already been made
+ if(k >= alts[y].length()){
+ choice_applicable = false;
+ break;
+ }else{
+ if(ref[k] != alts[y][k]){
+ choice_applicable = false;
+ break;
+ }
+ }
+ }
+ }
+
+ if(choice_applicable){
+ candidate_choices[i].push_back(pair<int, int>(var_index, 0));
+ }
+ // second candidate exists only for variants with two alternative alleles
+ if(var.multi_alts){
+
+ // swap the alleles so that alts[0] is the second allele, then repeat the applicability test
+ string temp = alts[0];
+ alts[0] = alts[1];
+ alts[1] = temp;
+
+ choice_applicable = true;
+ for(int k = 0; k < ref.length(); k++){
+ // for each ref char
+ //for(int y = 0; y < 2; y++)
+ int y = 0;
+ // only the first strand of side i is inspected
+ if(sp.string_sequences[i*2+y][k+pos] != "."){
+ // a decision covering this position has already been made
+ if(k >= alts[y].length()){
+ // the allele ends before this position (a deletion here): conflict
+ choice_applicable = false;
+ break;
+ }else{
+ // the allele must equal the reference at the current position
+ // it can be an insertion, as long as the current position is the same
+ if(ref[k] != alts[y][k]){
+ choice_applicable = false;
+ break;
+ }
+ }
+ }
+ }
+
+ if(choice_applicable){
+ candidate_choices[i].push_back(pair<int, int>(var_index, 1));
+ }
+ }
+ }
+ }
+
+ //dout << candidate_choices[0].size() << "," << candidate_choices[1].size() << endl;
+ // one new path per pair (candidate of side 0, candidate of side 1)
+ for(int i = 0; i < candidate_choices[0].size(); i++){
+ for(int j = 0; j < candidate_choices[1].size(); j++){
+ // iterate over all combinations of choices
+ SequencePath path = sp;
+ pair<int, int> var_choice[2];
+ var_choice[0] = candidate_choices[0][i];
+ var_choice[1] = candidate_choices[1][j];
+ for(int x = 0; x < 2; x++){
+ // iterate truth and predict
+ int var_index = var_choice[x].first;
+ if(var_index != -1){
+ DiploidVariant var = variant_list[var_index];
+ // if(var.flag != x){
+ // dout << "Error" << endl;
+ // }
+ string ref = var.ref;
+ string alts[2]; // only alts[0] is used below
+ int c = var_choice[x].second;
+ alts[0] = var.alts[c];
+ path.score += CalculateScore(var,
+ c,
+ score_unit,
+ match_mode,
+ score_scheme);
+ // compare ref and allele case-insensitively from here on
+ ToUpper(ref);
+ ToUpper(alts[0]);
+ int y = 0;
+ // positions before the last ref character: substitution, unchanged, or deleted ("")
+ int k = 0;
+ for(; k < ref.length()-1; k++){ // NOTE(review): unsigned arithmetic, assumes ref is non-empty - confirm
+ if(k < alts[y].length()){
+ if(ref[k] != alts[y][k]){
+ path.string_sequences[x*2+y][pos+k] = alts[y].substr(k,1);
+ }
+ // else: same as the reference, leave the entry untouched
+ }else{
+ path.string_sequences[x*2+y][pos+k] = "";
+ }
+ }
+ // here k == ref.length()-1, the last ref position; the rest of the allele (if any) is stored at this position
+ if(k < alts[y].length()){
+ string alt_part = alts[y].substr(k, alts[y].length()-k);
+ if(alt_part.length() > 1){
+ if(alt_part[0] == ref[k]){
+ if(path.string_sequences[x*2+y][pos+k] == "."){
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }else{
+ path.string_sequences[x*2+y][pos+k] += alt_part.substr(1, alt_part.size() - 1);
+ }
+ }else{
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }
+ }else{
+ if(ref[k] != alts[y][k]){
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }
+ }
+ }else{
+ path.string_sequences[x*2+y][pos+k] = "";
+ }
+ // the choice is recorded below even when it is (-1,-1)
+ }
+ path.choice_made[x][pos] = var_choice[x];
+ }
+ sequence_path_list.push_back(path);
+ }
+ }
+ // the number of appended paths is candidate_choices[0].size() * candidate_choices[1].size()
+ //expected number of inserted paths are 2,3,4,6,x...
+ return true;
+}
+
+bool WholeGenome::PathMakeDecision(SequencePath& sp, // genotype-aware branching of sp at its next position; sp is copied, never modified here
+ vector<DiploidVariant> & variant_list, // variants addressed by the indices stored in choices_by_pos
+ multimap<int, int> * choices_by_pos[], // per side: position -> variant index
+ list<SequencePath> & sequence_path_list, // output: every branched path is appended here
+ const string & reference_sequence, // only mentioned in commented-out code below
+ int score_unit, // forwarded to CalculateScore
+ int match_mode, // forwarded to CalculateScore
+ int score_scheme) // forwarded to CalculateScore
+{ // always returns true
+ int pos = sp.current_genome_pos+1;
+ // candidate_choices[i] holds (variant index, phasing choice) pairs for side i; choice is 0, 1 or -1
+ vector<pair<int, int>> candidate_choices[2];
+ for(int i = 0; i < 2; i++){
+
+ // (-1,-1) changes nothing when applied, so it is always a legal candidate
+ candidate_choices[i].push_back(pair<int, int>(-1, -1));
+ // at this position, choose not to use any variant, whether or not a variant exists here
+
+ pair<multimap<int, int>::iterator, multimap<int, int>::iterator> var_range;
+ var_range = choices_by_pos[i]->equal_range(pos);
+
+ for(auto it = var_range.first; it != var_range.second; ++it){
+ int var_index = (*it).second;
+ DiploidVariant var = variant_list[var_index];
+ //PrintVariant(var);
+
+ // check whether the current variant can still be applied
+ string ref = var.ref; //even we do not know the offset, we know ref start from pos of reference_sequence
+ string alts[2]; // allele placed on strand 0 / strand 1 of this side
+ alts[0] = var.alts[0];
+ alts[1] = alts[0];
+ if(var.multi_alts){
+ alts[1] = var.alts[1];
+ }else if(var.heterozygous){
+ alts[1] = ref;
+ }
+
+ // what is tested is not merely whether the variant can be applied, but whether this particular choice can
+
+ bool choice_applicable = true;
+ for(int k = 0; k < ref.length(); k++){
+ // for each ref char
+ for(int y = 0; y < 2; y++){
+ // for each strain
+ if(sp.string_sequences[i*2+y][k+pos] != "."){
+ // a decision covering this position has already been made
+ if(k >= alts[y].length()){
+ choice_applicable = false;
+ break;
+ }else{
+ if(ref[k] != alts[y][k]){
+ choice_applicable = false;
+ break;
+ }
+ }
+ }
+ }
+ if(!choice_applicable) break;
+ }
+
+ if(choice_applicable){
+ candidate_choices[i].push_back(pair<int, int>(var_index, 0));
+ }
+
+ if(var.heterozygous){
+
+ // if heterozygous there is a second choice (alleles swapped between the strands); check if it is applicable
+ string temp = alts[0];
+ alts[0] = alts[1];
+ alts[1] = temp;
+
+ choice_applicable = true;
+ for(int k = 0; k < ref.length(); k++){
+ // for each ref char
+ for(int y = 0; y < 2; y++){
+ // for each strain
+ if(sp.string_sequences[i*2+y][k+pos] != "."){
+ // a decision covering this position has already been made
+ if(k >= alts[y].length()){
+ // the allele ends before this position (a deletion here): conflict
+ choice_applicable = false;
+ break;
+ }else{
+ // the allele must equal the reference at the current position
+ // it can be an insertion, as long as the current position is the same
+ if(ref[k] != alts[y][k]){
+ choice_applicable = false;
+ break;
+ }
+ }
+ }
+ }
+ if(!choice_applicable) break;
+ }
+
+ if(choice_applicable){
+ if(var.multi_alts){
+ candidate_choices[i].push_back(pair<int, int>(var_index, 1)); // second alt on strand 0, first alt on strand 1
+ }else{
+ candidate_choices[i].push_back(pair<int, int>(var_index, -1)); // ref on strand 0, the alt on strand 1
+ }
+ }
+ }
+ }
+ }
+
+ //dout << candidate_choices[0].size() << "," << candidate_choices[1].size() << endl;
+ // one new path per pair (candidate of side 0, candidate of side 1)
+ for(int i = 0; i < candidate_choices[0].size(); i++){
+ for(int j = 0; j < candidate_choices[1].size(); j++){
+ // iterate over all combinations of choices
+ SequencePath path = sp;
+ pair<int, int> var_choice[2];
+ var_choice[0] = candidate_choices[0][i];
+ var_choice[1] = candidate_choices[1][j];
+ for(int x = 0; x < 2; x++){
+ // iterate truth and predict
+ int var_index = var_choice[x].first;
+ if(var_index != -1){
+// string temp_sequence = reference_sequence.substr(pos, 1);
+// path.string_sequences[x*2][pos] = temp_sequence;
+// path.string_sequences[x*2+1][pos] = temp_sequence;
+// }else{
+ // set score
+
+
+ DiploidVariant var = variant_list[var_index];
+ // if(var.flag != x){
+ // dout << "Error" << endl;
+ // }
+ string ref = var.ref;
+ string alts[2];
+ // rebuild the allele pair that corresponds to choice c
+ int c = var_choice[x].second;
+ if(c == -1){
+ alts[0] = ref;
+ alts[1] = var.alts[0];
+ }else{
+ // c == 0 or 1
+ alts[0] = var.alts[c];
+ alts[1] = alts[0];
+
+ if(var.multi_alts){
+ // the other strand gets the other alt allele
+ alts[1] = var.alts[1- c];
+ }else{
+ // c is 0: the other strand gets ref when heterozygous, else the same alt
+ if(var.heterozygous) alts[1] = ref;
+ }
+ }
+
+ path.score += CalculateScore(var,
+ c,
+ score_unit,
+ match_mode,
+ score_scheme);
+
+ ToUpper(ref);
+ ToUpper(alts[0]);
+ ToUpper(alts[1]);
+ for(int y = 0; y < 2; y++){
+ // iterate two alts
+ string alt = alts[y];
+ vector<string> alt_vector; // presumably one entry per ref position, filled by GenerateAltVector - TODO confirm
+ GenerateAltVector(ref, alt, alt_vector);
+
+ int k = 0;
+ for(; k < ref.length()-1; k++){ // NOTE(review): unsigned arithmetic, assumes ref is non-empty - confirm
+
+ if(alt_vector[k].size() != 1 || ref[k] != alt_vector[k][0]){
+ path.string_sequences[x*2+y][pos+k] = alt_vector[k];
+ }
+ // else: same as the reference, leave the entry untouched
+
+ }
+ // here k == ref.length()-1, the last ref position
+ assert(k == ref.length()-1);
+ string alt_part = alt_vector[k];
+ if(alt_part.length() > 0){
+ if(alt_part.length() > 1){
+ if(alt_part[0] == ref[k]){
+ if(path.string_sequences[x*2+y][pos+k] == "."){
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }else{
+ path.string_sequences[x*2+y][pos+k] += alt_part.substr(1, alt_part.size() - 1);
+ }
+ }else{
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }
+ }else{
+ if(ref[k] != alt_vector[k][0]){
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }
+ }
+ }else{
+ path.string_sequences[x*2+y][pos+k] = "";
+ }
+ }
+ }
+ path.choice_made[x][pos] = var_choice[x]; // recorded even when the choice is (-1,-1)
+ }
+ // choice made
+ //dout << "after decision at pos " << pos << endl;
+ //PrintPath(path);
+ sequence_path_list.push_back(path);
+ }
+ }
+
+ //expected number of inserted paths are 2,3,4,6,x...
+ return true;
+}
+
+bool WholeGenome::PathMakeDecisionBackup(SequencePath& sp, // genotype-aware branching of sp at its next position; alleles are applied character by character (no GenerateAltVector)
+ vector<DiploidVariant> & variant_list, // variants addressed by the indices stored in choices_by_pos
+ multimap<int, int> * choices_by_pos[], // per side: position -> variant index
+ list<SequencePath> & sequence_path_list, // output: every branched path is appended here
+ const string & reference_sequence, // only mentioned in commented-out code below
+ int score_unit, // forwarded to CalculateScore
+ int match_mode, // forwarded to CalculateScore
+ int score_scheme) // forwarded to CalculateScore
+{ // always returns true; NOTE(review): no caller is visible in this part of the file - confirm whether it is still used
+ int pos = sp.current_genome_pos+1;
+ // candidate_choices[i] holds (variant index, phasing choice) pairs for side i; choice is 0, 1 or -1
+ vector<pair<int, int>> candidate_choices[2];
+ for(int i = 0; i < 2; i++){
+
+ // (-1,-1) changes nothing when applied, so it is always a legal candidate
+ candidate_choices[i].push_back(pair<int, int>(-1, -1));
+ // at this position, choose not to use any variant, whether or not a variant exists here
+
+ pair<multimap<int, int>::iterator, multimap<int, int>::iterator> var_range;
+ var_range = choices_by_pos[i]->equal_range(pos);
+
+ for(auto it = var_range.first; it != var_range.second; ++it){
+ int var_index = (*it).second;
+ DiploidVariant var = variant_list[var_index];
+ //PrintVariant(var);
+
+ // check whether the current variant can still be applied
+ string ref = var.ref; //even we do not know the offset, we know ref start from pos of reference_sequence
+ string alts[2]; // allele placed on strand 0 / strand 1 of this side
+ alts[0] = var.alts[0];
+ alts[1] = alts[0];
+ if(var.multi_alts){
+ alts[1] = var.alts[1];
+ }else if(var.heterozygous){
+ alts[1] = ref;
+ }
+
+ // what is tested is not merely whether the variant can be applied, but whether this particular choice can
+
+ bool choice_applicable = true;
+ for(int k = 0; k < ref.length(); k++){
+ // for each ref char
+ for(int y = 0; y < 2; y++){
+ // for each strain
+ if(sp.string_sequences[i*2+y][k+pos] != "."){
+ // a decision covering this position has already been made
+ if(k >= alts[y].length()){
+ choice_applicable = false;
+ break;
+ }else{
+ if(ref[k] != alts[y][k]){
+ choice_applicable = false;
+ break;
+ }
+ }
+ }
+ }
+ if(!choice_applicable) break;
+ }
+
+ if(choice_applicable){
+ candidate_choices[i].push_back(pair<int, int>(var_index, 0));
+ }
+
+ if(var.heterozygous){
+
+ // if heterozygous there is a second choice (alleles swapped between the strands); check if it is applicable
+ string temp = alts[0];
+ alts[0] = alts[1];
+ alts[1] = temp;
+
+ choice_applicable = true;
+ for(int k = 0; k < ref.length(); k++){
+ // for each ref char
+ for(int y = 0; y < 2; y++){
+ // for each strain
+ if(sp.string_sequences[i*2+y][k+pos] != "."){
+ // a decision covering this position has already been made
+ if(k >= alts[y].length()){
+ // the allele ends before this position (a deletion here): conflict
+ choice_applicable = false;
+ break;
+ }else{
+ // the allele must equal the reference at the current position
+ // it can be an insertion, as long as the current position is the same
+ if(ref[k] != alts[y][k]){
+ choice_applicable = false;
+ break;
+ }
+ }
+ }
+ }
+ if(!choice_applicable) break;
+ }
+
+ if(choice_applicable){
+ if(var.multi_alts){
+ candidate_choices[i].push_back(pair<int, int>(var_index, 1)); // second alt on strand 0, first alt on strand 1
+ }else{
+ candidate_choices[i].push_back(pair<int, int>(var_index, -1)); // ref on strand 0, the alt on strand 1
+ }
+ }
+ }
+ }
+ }
+
+ //dout << candidate_choices[0].size() << "," << candidate_choices[1].size() << endl;
+ // one new path per pair (candidate of side 0, candidate of side 1)
+ for(int i = 0; i < candidate_choices[0].size(); i++){
+ for(int j = 0; j < candidate_choices[1].size(); j++){
+ // iterate over all combinations of choices
+ SequencePath path = sp;
+ pair<int, int> var_choice[2];
+ var_choice[0] = candidate_choices[0][i];
+ var_choice[1] = candidate_choices[1][j];
+ for(int x = 0; x < 2; x++){
+ // iterate truth and predict
+ int var_index = var_choice[x].first;
+ if(var_index != -1){
+// string temp_sequence = reference_sequence.substr(pos, 1);
+// path.string_sequences[x*2][pos] = temp_sequence;
+// path.string_sequences[x*2+1][pos] = temp_sequence;
+// }else{
+ // set score
+
+
+ DiploidVariant var = variant_list[var_index];
+ // if(var.flag != x){
+ // dout << "Error" << endl;
+ // }
+ string ref = var.ref;
+ string alts[2];
+ // rebuild the allele pair that corresponds to choice c
+ int c = var_choice[x].second;
+ if(c == -1){
+ alts[0] = ref;
+ alts[1] = var.alts[0];
+ }else{
+ // c == 0 or 1
+ alts[0] = var.alts[c];
+ alts[1] = alts[0];
+
+ if(var.multi_alts){
+ // the other strand gets the other alt allele
+ alts[1] = var.alts[1- c];
+ }else{
+ // c is 0: the other strand gets ref when heterozygous, else the same alt
+ if(var.heterozygous) alts[1] = ref;
+ }
+ }
+
+ path.score += CalculateScore(var,
+ c,
+ score_unit,
+ match_mode,
+ score_scheme);
+
+ ToUpper(ref);
+ ToUpper(alts[0]);
+ ToUpper(alts[1]);
+ for(int y = 0; y < 2; y++){
+ // iterate two alts
+ int k = 0;
+ for(; k < ref.length()-1; k++){ // NOTE(review): unsigned arithmetic, assumes ref is non-empty - confirm
+ if(k < alts[y].length()){
+ if(ref[k] != alts[y][k]){
+ path.string_sequences[x*2+y][pos+k] = alts[y].substr(k,1);
+ }
+ // else: same as the reference, leave the entry untouched
+ }else{
+ path.string_sequences[x*2+y][pos+k] = "";
+ }
+ }
+ // here k == ref.length()-1, the last ref position; the rest of the allele (if any) is stored at this position
+ if(k < alts[y].length()){
+ string alt_part = alts[y].substr(k, alts[y].length()-k);
+ if(alt_part.length() > 1){
+ if(alt_part[0] == ref[k]){
+ if(path.string_sequences[x*2+y][pos+k] == "."){
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }else{
+ path.string_sequences[x*2+y][pos+k] += alt_part.substr(1, alt_part.size() - 1);
+ }
+ }else{
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }
+ }else{
+ if(ref[k] != alts[y][k]){
+ path.string_sequences[x*2+y][pos+k] = alt_part;
+ }
+ }
+ }else{
+ path.string_sequences[x*2+y][pos+k] = "";
+ }
+ }
+ }
+ path.choice_made[x][pos] = var_choice[x]; // recorded even when the choice is (-1,-1)
+ }
+ // choice made
+ //dout << "after decision at pos " << pos << endl;
+ //PrintPath(path);
+ sequence_path_list.push_back(path);
+ }
+ }
+
+ //expected number of inserted paths are 2,3,4,6,x...
+ return true;
+}
+
+// Debug helper: write a path to stdout - the four per-position string sequences (entries
+// separated by a blank), the four donor sequences and the removable flag.
+void WholeGenome::PrintPath(SequencePath & sp){
+    cout << "- Sequence Path:" << endl;
+
+    cout << "@ String Sequences:" << endl;
+    for(int strand = 0; strand < 4; strand++){
+        for(const auto & piece : sp.string_sequences[strand]){
+            cout << piece << " ";
+        }
+        cout << endl;
+    }
+
+    cout << "@ Donor Sequences:" << endl;
+    for(int strand = 0; strand < 4; strand++){
+        cout << sp.donor_sequences[strand] << endl;
+    }
+
+    cout << "@ Removable: " << sp.removable << endl;
+}
+
+// Search the best matching path for one cluster by extending paths from sync point to sync point.
+//  - loop until the current path list is empty
+//  - a path that reaches the next sync point is moved to the next path list
+//  - a path that needs a decision is branched; its branches are appended to the current list
+//  - a path that reaches the end is compared with the best path found so far
+// After every sync step the surviving paths are merged with ConvergePaths.
+//
+// A legal sync_points vector contains at least two entries: the first is the end of a variant
+// (there should be at least one variant), the second is the end of the subsequence (there
+// should be at least one nt not influenced by a variant).
+//
+// match_mode == 0 uses PathMakeDecision / ConstructMatchRecord, any other mode the NoGenotype
+// variants. cluster_index is currently not used.
+// Returns false when no complete path with a positive score exists, true after the match
+// record of the best path has been constructed.
+bool WholeGenome::MatchingSingleClusterBaseExtending(int cluster_index,
+                                                     int thread_index,
+                                                     vector<DiploidVariant> & variant_list,
+                                                     string & subsequence,
+                                                     int offset,
+                                                     multimap<int, int> * choices_by_pos[],
+                                                     vector<int> & sync_points,
+                                                     int chr_id,
+                                                     int score_unit,
+                                                     int match_mode,
+                                                     int score_scheme,
+                                                     int threshold_index){
+    list<SequencePath> current_path_list;
+    list<SequencePath> next_path_list;
+    SequencePath sp(subsequence.length());
+    SequencePath best_path = sp;
+    current_path_list.push_back(sp);
+
+    while(current_path_list.size() != 0){
+        while(current_path_list.size() != 0){
+            SequencePath path = current_path_list.front();
+            current_path_list.pop_front();
+
+            int is_extend = PathExtendOneStep(path, choices_by_pos, subsequence, sync_points, match_mode);
+            if(is_extend == -1){
+                // the path is dead, drop it
+                continue;
+            }
+            else if(is_extend == 0){
+                // the path reached the next sync point
+                next_path_list.push_back(path);
+            }else if(is_extend == 1){
+                // branches go back into the current list and are extended in this same round
+                if(match_mode == 0){
+                    PathMakeDecision(path,
+                                     variant_list,
+                                     choices_by_pos,
+                                     current_path_list,
+                                     subsequence,
+                                     score_unit,
+                                     match_mode,
+                                     score_scheme);
+                }else{
+                    PathMakeDecisionNoGenotype(path,
+                                               variant_list,
+                                               choices_by_pos,
+                                               current_path_list,
+                                               subsequence,
+                                               score_unit,
+                                               match_mode,
+                                               score_scheme);
+                }
+            }else if(is_extend == 2){
+                // only a path that reached the very end can become the best path
+                if(path.score > best_path.score){
+                    best_path = path;
+                }
+            }
+        }
+
+        // current_path_list is empty here, so swapping hands over the next list without
+        // copying every path and leaves next_path_list empty for the following round
+        current_path_list.swap(next_path_list);
+        if(current_path_list.size() > 0){
+            ConvergePaths(current_path_list);
+        }
+    }
+
+    if(best_path.score <= 0) return false;
+
+    //==========================output ======================
+    int mode_index = GetIndexFromMatchScore(score_unit, match_mode, score_scheme);
+
+    if(match_mode == 0){
+        ConstructMatchRecord(best_path,
+                             variant_list,
+                             subsequence,
+                             offset,
+                             thread_index,
+                             chr_id,
+                             mode_index,
+                             threshold_index);
+    }else{
+        ConstructMatchRecordNoGenotype(best_path,
+                                       variant_list,
+                                       subsequence,
+                                       offset,
+                                       thread_index,
+                                       chr_id,
+                                       mode_index,
+                                       threshold_index);
+    }
+    return true;
+}
+
+// Turn the best path of a cluster into statistics and, for threshold index 0 only, into a
+// textual match record.
+// Every choice of the path that selected a variant is counted: variants with flag unset go to
+// baseline_total_match_num, variants with flag set go to query_total_match_num (both for
+// [thread_index][threshold_index] at mode_index).
+// The record is one tab separated line:
+//   chromosome name, offset+1, subsequence, donor 0 ["/" donor 1 when they differ],
+//   variants of side 0, variants of side 1, phasing of side 0, phasing of side 1, score
+// where variants are "pos+1,ref,alt[/alt2]" and both lists are joined with ';'.
+void WholeGenome::ConstructMatchRecord(SequencePath & best_path,
+                                       vector<DiploidVariant> & variant_list,
+                                       string & subsequence,
+                                       int offset,
+                                       int thread_index,
+                                       int chr_id,
+                                       int mode_index,
+                                       int threshold_index){
+    const bool need_match_record = (threshold_index == 0);
+
+    const string donor0 = best_path.donor_sequences[0];
+    const string donor1 = best_path.donor_sequences[1];
+
+    string match_record = chrname_by_chrid[chr_id] + "\t" + to_string(offset+1) + "\t" + subsequence + "\t" + donor0;
+    // the two haplotypes are only listed separately when they differ
+    if(donor0 != donor1) match_record += "/" + donor1;
+
+    int truth_num = 0;
+    int predict_num = 0;
+    string vcf_record[2];
+    string phasing_record[2];
+
+    for (int side = 0; side < 2; side++) {
+        for (const auto & entry : best_path.choice_made[side]) {
+            const pair<int, int> selection = entry.second;
+            if(selection.first == -1) continue;   // no variant was used at this position
+
+            // choice -1 (reference on the first strand) is reported like phasing 1
+            const int phasing = (selection.second == -1) ? 1 : selection.second;
+
+            const DiploidVariant variant = variant_list[selection.first];
+            if(variant.flag){
+                predict_num++;
+            }else{
+                truth_num++;
+            }
+
+            if(!need_match_record) continue;
+
+            string alt_string = variant.alts[0];
+            if(variant.multi_alts){
+                alt_string += "/" + variant.alts[1];
+            }
+
+            string phasing_string = "";
+            if(phasing == 0){
+                if(!variant.heterozygous){
+                    phasing_string = "1|1";
+                }else if(variant.multi_alts){
+                    phasing_string = "1|2";
+                }else{
+                    phasing_string = "1|0";
+                }
+            }else if(phasing == 1){
+                phasing_string = variant.multi_alts ? "2|1" : "0|1";
+            }
+
+            vcf_record[side] += to_string(variant.pos+1) + "," + variant.ref + "," + alt_string;
+            vcf_record[side] += ";";
+            phasing_record[side] += phasing_string;
+            phasing_record[side] += ";";
+        }
+
+        if(need_match_record){
+            // drop the trailing ';' (an empty list stays empty)
+            if(!vcf_record[side].empty()) vcf_record[side].pop_back();
+            if(!phasing_record[side].empty()) phasing_record[side].pop_back();
+        }
+    }
+
+    if(need_match_record){
+        match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+        match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+        match_record += "\t" + to_string(best_path.score) + "\n";
+
+        match_records_by_mode_by_thread[thread_index][mode_index]->push_back(match_record);
+    }
+
+    baseline_total_match_num[thread_index][threshold_index]->at(mode_index) += truth_num;
+    query_total_match_num[thread_index][threshold_index]->at(mode_index) += predict_num;
+}
+
+
+// Genotype-free counterpart of the match record construction: turn the best path of a cluster
+// into statistics and, for threshold index 0 only, into a textual match record.
+// Choices without a variant (index -1) or with allele choice -1 are ignored. Every remaining
+// choice is counted: variants with flag unset go to baseline_total_match_num, variants with
+// flag set go to query_total_match_num (both for [thread_index][threshold_index] at mode_index).
+// The record is one tab separated line:
+//   chromosome name, offset+1, subsequence, donor 0,
+//   variants of side 0, variants of side 1, phasing of side 0, phasing of side 1, score
+// where variants are "pos+1,ref,alt[/alt2]", phasing is "1|1" for allele 0 and "2|2" for
+// allele 1, and both lists are joined with ';'.
+void WholeGenome::ConstructMatchRecordNoGenotype(SequencePath & best_path,
+                                                 vector<DiploidVariant> & variant_list,
+                                                 string & subsequence,
+                                                 int offset,
+                                                 int thread_index,
+                                                 int chr_id,
+                                                 int mode_index,
+                                                 int threshold_index){
+    const bool need_match_record = (threshold_index == 0);
+
+    // only the first donor is reported in this mode
+    string match_record = chrname_by_chrid[chr_id] + "\t" + to_string(offset+1) + "\t" + subsequence + "\t" + best_path.donor_sequences[0];
+
+    int truth_num = 0;
+    int predict_num = 0;
+    string vcf_record[2];
+    string phasing_record[2];
+
+    for (int side = 0; side < 2; side++) {
+        for (const auto & entry : best_path.choice_made[side]) {
+            const pair<int, int> selection = entry.second;
+            const int phasing = selection.second;
+            if(selection.first == -1 || phasing == -1) continue;
+
+            const DiploidVariant variant = variant_list[selection.first];
+            if(variant.flag){
+                predict_num++;
+            }else{
+                truth_num++;
+            }
+
+            if(!need_match_record) continue;
+
+            string alt_string = variant.alts[0];
+            if(variant.multi_alts){
+                alt_string += "/" + variant.alts[1];
+            }
+
+            string phasing_string = "";
+            if(phasing == 0){
+                phasing_string = "1|1";
+            }else if(phasing == 1){
+                phasing_string = "2|2";
+            }
+
+            vcf_record[side] += to_string(variant.pos+1) + "," + variant.ref + "," + alt_string;
+            vcf_record[side] += ";";
+            phasing_record[side] += phasing_string;
+            phasing_record[side] += ";";
+        }
+
+        if(need_match_record){
+            // drop the trailing ';' (an empty list stays empty)
+            if(!vcf_record[side].empty()) vcf_record[side].pop_back();
+            if(!phasing_record[side].empty()) phasing_record[side].pop_back();
+        }
+    }
+
+    if(need_match_record){
+        match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+        match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+        match_record += "\t" + to_string(best_path.score) + "\n";
+
+        match_records_by_mode_by_thread[thread_index][mode_index]->push_back(match_record);
+    }
+
+    baseline_total_match_num[thread_index][threshold_index]->at(mode_index) += truth_num;
+    query_total_match_num[thread_index][threshold_index]->at(mode_index) += predict_num;
+}
+
+// Two paths are length-equal when, separately for the truth donors (indices 0,1) and for the
+// query donors (indices 2,3), the pair of donor lengths of `a` equals the pair of donor
+// lengths of `b`, either in the same or in swapped strand order.
+bool WholeGenome::DonorLengthEqual(SequencePath & a, SequencePath & b){
+    for(int side = 0; side < 2; side++){
+        const int first = 2*side;
+        const int second = 2*side + 1;
+
+        const auto a_first = a.donor_sequences[first].length();
+        const auto a_second = a.donor_sequences[second].length();
+        const auto b_first = b.donor_sequences[first].length();
+        const auto b_second = b.donor_sequences[second].length();
+
+        const bool same_order = (a_first == b_first) && (a_second == b_second);
+        const bool swapped_order = (a_first == b_second) && (a_second == b_first);
+        if(!same_order && !swapped_order) return false;
+    }
+    return true;
+}
+
+// Predicate for list::remove_if: true for a path whose removable flag is set.
+bool IsRemovable(SequencePath & s){
+    return s.removable;
+}
+
+// Merge equivalent paths: among paths that are not yet marked removable and have
+// same_donor_len set, whenever two paths are length-equal (DonorLengthEqual) the one with the
+// lower score is marked removable; on a tie the later one in the list is marked.
+// All marked paths are erased from path_list at the end.
+// Paths are accessed through references: the comparison is quadratic in the number of paths,
+// so copying a whole SequencePath for every comparison is avoided.
+void WholeGenome::ConvergePaths(list<SequencePath> & path_list){
+    if(path_list.size() <= 1) return;
+
+    for(list<SequencePath>::iterator i = path_list.begin(); i != path_list.end(); ++i){
+        SequencePath & ref_path = *i;
+        if(ref_path.removable) continue;
+        if(!ref_path.same_donor_len) continue;
+
+        list<SequencePath>::iterator j = i;
+        ++j;
+        for(; j != path_list.end(); ++j){
+            SequencePath & que_path = *j;
+            if(que_path.removable) continue;
+            if(!que_path.same_donor_len) continue;
+            if(!DonorLengthEqual(ref_path, que_path)) continue;
+
+            if(ref_path.score >= que_path.score){
+                que_path.removable = true;
+            }else{
+                // the reference path lost: mark it and stop comparing against it
+                ref_path.removable = true;
+                break;
+            }
+        }
+    }
+
+    path_list.remove_if(IsRemovable);
+}
+
+int WholeGenome::test() { // ad-hoc fixture: sets a small sequence and builds sample variants; the matching calls are commented out, so nothing is checked and 0 is returned
+ genome_sequences[0] = "GTCAGCCGG"; // sequence stored under index 0
+ DiploidVariant d1(1, "T", vector<string> ({"A", "C"}), true, true, 0,0,0); // arguments look like (pos, ref, alts, ...); meaning of the remaining ones is set by the DiploidVariant constructor - TODO confirm
+ DiploidVariant d2(4, "G", vector<string> ({"C", ""}), true, false, 0,0,0);
+ DiploidVariant d3(5, "C", vector<string> ({"T", ""}), true, false, 0,0,0); // this is false negative
+ DiploidVariant d4(6, "C", vector<string> ({"G", ""}), true, false, 0,0,0);
+ DiploidVariant d5(7, "G", vector<string> ({"A", ""}), true, false, 0,0,0);
+ DiploidVariant d6(1, "T", vector<string> ({"A", "C"}), true, true, 0,0,1); // d6-d8 differ from d1-d5 in the last argument (1 instead of 0)
+ DiploidVariant d7(3, "AG", vector<string> ({"A", ""}), true, false, 1,0,1);
+ DiploidVariant d8(7, "G", vector<string> ({"GA", ""}), true, false, 0,1,1);
+ // the commented-out lines below belong to an older interface (complex_match_records)
+ //complex_match_records = new vector<string>*[1];
+ //complex_match_records[0] = new vector<string>;
+ //vector<DiploidVariant> var_list = { d2,d3,d4,d5,d7,d8 };
+ vector<DiploidVariant> var_list = { d1,d2,d3,d4,d5,d6,d7,d8 }; // built but not used: the call that consumed it is commented out
+ //cout << MatchingSingleClusterBaseExtending(var_list, 0) << endl;
+ //cout << complex_match_records[0]->at(0) << endl;
+ return 0;
+}
+
+// private
+void WholeGenome::ClusteringMatchMultiThread() {
+    // Matches every cluster in variants_by_cluster on thread_num workers (the
+    // calling thread serves as the last one), then writes <prefix>.stat and one
+    // <prefix>.<unit>_<mode>_<scheme>.match file per selected mode to output_dir.
+    int start = 0;
+    int cluster_number = variants_by_cluster.size(); // cluster number
+    int cluster_step = cluster_number / thread_num; // assign clusters to threads
+    if (cluster_step * thread_num < cluster_number) cluster_step++; // round up
+    int end = start + cluster_step;
+    // per-thread, per-mode record buffers; released at the end of this function
+    match_records_by_mode_by_thread = new vector<string>**[thread_num];
+
+    //query_matches_by_mode_by_thread = new vector<int> ** [thread_num];
+    for(int i = 0; i < thread_num; i++){
+        match_records_by_mode_by_thread[i] = new vector<string>*[MATCH_MODE_NUM];
+        for(int j = 0; j < MATCH_MODE_NUM; j++){
+            match_records_by_mode_by_thread[i][j] = new vector<string>;
+        }
+    }
+
+    baseline_total_match_num = new vector<int>** [thread_num];
+    query_total_match_num = new vector<int> ** [thread_num];
+
+    for(int i = 0; i < thread_num; i++){
+        // [thread][threshold slot] -> one zero-initialised counter per mode
+        baseline_total_match_num[i] = new vector<int>* [ROC_SAMPLE_NUM];
+        query_total_match_num[i] = new vector<int>* [ROC_SAMPLE_NUM];
+
+        for(int j = 0; j < ROC_SAMPLE_NUM; j++){
+            baseline_total_match_num[i][j] = new vector<int>;
+            baseline_total_match_num[i][j]->resize(MATCH_MODE_NUM, 0);
+            query_total_match_num[i][j] = new vector<int>;
+            query_total_match_num[i][j]->resize(MATCH_MODE_NUM, 0);
+        }
+    }
+
+    vector<thread> threads;
+    //spawn threads
+    int i = 0; // signed, like thread_num, so the comparisons below do not mix signedness
+    for (; i < thread_num - 1; i++) {
+        threads.push_back(thread(&WholeGenome::ClusteringMatchInThread, this, start, end, i));
+        start = end;
+        end = start + cluster_step;
+    }
+    // also you need to do a job in main thread
+    // i equals to (thread_num - 1)
+    if (i != thread_num - 1) {
+        dout << "[Error] thread number not match" << endl;
+    }
+    if (start >= variants_by_cluster.size()) {
+        dout << "[Error] index out of map range" << endl;
+    }
+    else {
+        ClusteringMatchInThread(start, end, i);
+    }
+
+    // wait for every worker before the per-thread results are read
+    std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
+
+    //output all results
+    cout << "writing results..." << endl;
+    ofstream output_stat_file;
+    output_stat_file.open(output_dir + "/" + output_prefix+".stat");
+
+    cout << "=========VarMatch Result Stat.=======" << endl;
+    string stat_head_string = "#score_unit\tmatch_mode\tscore_scheme\tqual_threshold\tbaseline_match_num\tquery_match_num\tquery_total_num"; // third column carries score_scheme
+    cout << stat_head_string << endl;
+    output_stat_file << "##Baseline:" << baseline_variant_total_num << endl;
+    output_stat_file << "##Query:"<< query_variant_total_num << endl;
+    output_stat_file << stat_head_string << endl;
+
+    int score_unit;
+    int match_mode;
+    int score_scheme;
+
+    for(int x = 0; x < score_unit_list.size(); x++){
+        score_unit = score_unit_list[x];
+        for(int y = 0; y < match_mode_list.size(); y++){
+            match_mode = match_mode_list[y];
+            for(int z = 0; z < score_scheme_list.size(); z++){
+                score_scheme = score_scheme_list[z];
+                int mode_index = GetIndexFromMatchScore(score_unit, match_mode, score_scheme);
+                // each statistic below is emitted as one comma-separated list
+                // holding one entry per quality threshold
+
+                string threshold_string = "";
+                string baseline_match_num_string = "";
+                string query_match_num_string = "";
+                string query_total_num_string = "";
+
+                for(int t = 0; t < threshold_num; t++){
+                    // NOTE(review): t indexes arrays of ROC_SAMPLE_NUM slots; assumes threshold_num <= ROC_SAMPLE_NUM - confirm
+                    threshold_string += to_string(threshold_list[t]);
+
+                    int baseline_match_num_by_threshold_by_mode = 0;
+                    int query_match_num_by_threshold_by_mode = 0;
+
+                    for(int i = 0; i < thread_num; i++){
+                        baseline_match_num_by_threshold_by_mode += baseline_total_match_num[i][t]->at(mode_index);
+                        query_match_num_by_threshold_by_mode += query_total_match_num[i][t]->at(mode_index);
+                    }
+
+                    baseline_match_num_string += to_string(baseline_match_num_by_threshold_by_mode);
+                    query_match_num_string += to_string(query_match_num_by_threshold_by_mode);
+                    query_total_num_string += to_string((int)(query_variant_total_num * (1-per_list[t])) );
+
+                    if(t < threshold_num-1){
+                        threshold_string += ",";
+                        baseline_match_num_string += ",";
+                        query_match_num_string += ",";
+                        query_total_num_string += ",";
+                    }
+
+                }
+
+                string total_match_num_string = to_string(score_unit) + "\t" +
+                    to_string(match_mode) + "\t" +
+                    to_string(score_scheme) + "\t" +
+                    threshold_string + "\t" +
+                    baseline_match_num_string + "\t" +
+                    query_match_num_string + "\t" +
+                    query_total_num_string;// + "\t" + to_string(mode_index);
+                cout << total_match_num_string << endl;
+                output_stat_file << total_match_num_string << endl;
+            }
+        }
+    }
+    output_stat_file.close();
+
+    int bench_mode_index = GetIndexFromMatchScore(0, 0, 0);
+
+    for(int x = 0; x < score_unit_list.size(); x++){
+        score_unit = score_unit_list[x];
+        for(int y = 0; y < match_mode_list.size(); y++){
+            match_mode = match_mode_list[y];
+            for(int z = 0; z < score_scheme_list.size(); z++){
+                score_scheme = score_scheme_list[z];
+                int mode_index = GetIndexFromMatchScore(score_unit, match_mode, score_scheme);
+                string filename_index = to_string(score_unit) + "_" + to_string(match_mode) + "_" + to_string(score_scheme);
+
+                ofstream output_complex_file;
+                output_complex_file.open(output_dir + "/" + output_prefix+"."+filename_index+".match");
+
+                output_complex_file << "##VCF1:" << ref_vcf_filename << endl;
+                output_complex_file << "##VCF2:" << que_vcf_filename << endl;
+                output_complex_file << "#CHROM\tPOS\tREF\tALT\tVCF1\tVCF2\tPHASE1\tPHASE2\tSCORE" << endl;
+
+                for(int i = 0; i < thread_num; i++){
+                    for(int k = 0; k < match_records_by_mode_by_thread[i][mode_index]->size(); k++){
+                        if (match_records_by_mode_by_thread[i][mode_index]->at(k).find_first_not_of(' ') != std::string::npos) { // skip empty / all-space records
+                            //if(match_records_by_mode_by_thread[i][mode_index]->at(k)[0] == '$'){
+                            //int bench_mode_index = stoi(match_records_by_mode_by_thread[i][mode_index]->at(k).erase(0,1));
+                            //output_complex_file << match_records_by_mode_by_thread[i][0]->at(k);
+                            //}else{
+                            output_complex_file << match_records_by_mode_by_thread[i][mode_index]->at(k);
+                            //}
+                        }
+                    }
+                }
+                output_complex_file.close();
+            }
+        }
+    }
+
+    // clear all matching records
+    for(int i = 0; i < thread_num; i++){
+        for(int j = 0; j < MATCH_MODE_NUM; j++){
+            delete match_records_by_mode_by_thread[i][j];
+        }
+        for(int j = 0; j < ROC_SAMPLE_NUM; j++){
+            delete baseline_total_match_num[i][j];
+            delete query_total_match_num[i][j];
+        }
+        delete[] match_records_by_mode_by_thread[i];
+        delete[] baseline_total_match_num[i];
+        delete[] query_total_match_num[i];
+    }
+    delete[] match_records_by_mode_by_thread;
+    delete[] baseline_total_match_num;
+    delete[] query_total_match_num;
+
+}
+
+
+int WholeGenome::NormalizeVariantSequence(int pos, string & parsimonious_ref, string & parsimonious_alt0, string & parsimonious_alt1, int chr_id) {
+    // Trims the common suffix, then the common prefix, of the three alleles in place and returns the adjusted start position.
+    int left_index = pos;
+    if (genome_sequences[chr_id].size() == 0) return -1; // no sequence available for this chromosome id
+    // Single-base alleles are already minimal: report the unchanged position (a boolean `true` here would read as position 1).
+    if (parsimonious_ref.size() == 1 && parsimonious_alt0.size() == 1 && parsimonious_alt1.size() == 1) return left_index;
+    bool change_in_allels = true;
+    while (change_in_allels) {
+        change_in_allels = false;
+        if (parsimonious_ref.back() == parsimonious_alt0.back() && parsimonious_ref.back() == parsimonious_alt1.back() ) {
+            if ((parsimonious_ref.size() > 1 && parsimonious_alt0.size() > 1 && parsimonious_alt1.size() > 1) || left_index > 0) { // when left_index == 0, can not make further changes
+                parsimonious_ref.pop_back();
+                parsimonious_alt0.pop_back();
+                parsimonious_alt1.pop_back();
+                change_in_allels = true;
+            }
+            // else do not make further changes
+        }
+        if (parsimonious_ref.length() == 0 || parsimonious_alt0.length() == 0 || parsimonious_alt1.length() == 0) { // an allele ran empty: pad all three with the preceding reference base
+            left_index--; // NOTE(review): becomes -1 if an allele is already empty while left_index == 0 - confirm callers never pass that
+            char left_char = toupper(genome_sequences[chr_id][left_index]);
+            parsimonious_ref = left_char + parsimonious_ref;
+            parsimonious_alt0 = left_char + parsimonious_alt0;
+            parsimonious_alt1 = left_char + parsimonious_alt1;
+        }
+    }
+    while (parsimonious_ref[0] == parsimonious_alt0[0] &&
+        parsimonious_ref[0] == parsimonious_alt1[0] &&
+        parsimonious_ref.size() > 1 &&
+        parsimonious_alt0.size() > 1 &&
+        parsimonious_alt1.size() > 1)
+    {
+        parsimonious_ref.erase(0, 1);
+        parsimonious_alt0.erase(0, 1);
+        parsimonious_alt1.erase(0, 1);
+        left_index ++; // dropping the shared leading base moves the variant start one base to the right
+    }
+    return left_index;
+}
+
+void WholeGenome::SingleThreadClustering(int chr_id) { // merges the baseline and query variants of one chromosome by position and splits them into clusters
+ int ins_len[2] = { 0 }; // summed snp.mil of the current cluster, indexed by 0/1 derived from snp.flag
+ int del_len[2] = { 0 }; // summed snp.mdl of the current cluster, same indexing
+ int c_start = 0; // largest (pos + ref length) seen in the current cluster
+ int c_end = 0; // pos of the variant currently being placed
+ sort(ref_variant_by_chrid[chr_id]->begin(), ref_variant_by_chrid[chr_id]->end());
+ sort(que_variant_by_chrid[chr_id]->begin(), que_variant_by_chrid[chr_id]->end());
+ int ref_size = ref_variant_by_chrid[chr_id]->size();
+ int que_size = que_variant_by_chrid[chr_id]->size();
+ //dout << chr_id << "," << ref_size << "," << que_size << endl;
+
+ int ref_index = 0;
+ int que_index = 0;
+ bool not_first = false;
+ DiploidVariant snp;
+ vector<VariantIndicator> vi_list; // members of the cluster under construction
+ while (ref_index < ref_size || que_index < que_size) {
+ bool take_que = true; // query is taken unless the baseline pos is strictly smaller or the query list is exhausted
+ if(ref_index < ref_size && que_index < que_size){
+ if(ref_variant_by_chrid[chr_id]->at(ref_index).pos < que_variant_by_chrid[chr_id]->at(que_index).pos){
+ take_que = false;
+ }
+ }else if(ref_index < ref_size){
+ take_que = false;
+ }
+ int var_index;
+ if(take_que){
+
+ snp = que_variant_by_chrid[chr_id]->at(que_index);
+ //cout << "q |" << que_index << "," << snp.pos << endl;
+ var_index = que_index;
+ que_index++;
+ }else{
+ snp = ref_variant_by_chrid[chr_id]->at(ref_index);
+ //cout << "r |" << ref_index << "," << snp.pos << endl;
+ var_index = ref_index;
+ ref_index++;
+ }
+ // check if need to separator clusters
+ if (not_first) {
+ c_end = snp.pos;
+ if (c_end - c_start >= 2) { // a split is only considered when the gap spans at least 2 bases
+ int separator_length = c_end - c_start;
+ string separator = genome_sequences[chr_id].substr(c_start, separator_length);
+ int max_change = max(ins_len[0] + del_len[1], ins_len[1] + del_len[0]);
+ bool separate_cluster = false;
+ if(max_change == 0){ // nothing accumulated in ins_len/del_len: always split
+ separate_cluster = true;
+ }
+ else if (separator_length > 2 * max_change &&
+ (separator_length > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, max_change)))
+ {
+ separate_cluster = true;
+ }
+
+ if(separate_cluster){
+ variant_cluster_by_chrid[chr_id]->push_back(vi_list);
+ vi_list.clear();
+ ins_len[0] = 0;
+ del_len[0] = 0;
+ ins_len[1] = 0;
+ del_len[1] = 0;
+ c_start = 0; // re-assign c_start
+ }
+ }
+ }
+ c_start = max(c_start, snp.pos + (int)snp.ref.length() );
+ VariantIndicator current_variant_indicator(chr_id, var_index, !take_que); // third argument (refer) is true for baseline variants
+ vi_list.push_back(current_variant_indicator);
+ //cluster_vars_map[cluster_index].push_back(snp);
+ if(!not_first) not_first = true;
+ int ref_length = (int)(snp.ref.length()); // NOTE(review): not used afterwards
+ int flag = 0;
+ if(snp.flag) flag = 1;
+// DiploidVariant snp = front_cluster[k];
+// int rq = snp.flag;
+ ins_len[flag] += snp.mil;
+ del_len[flag] += snp.mdl;
+ }
+ if(vi_list.size() > 0){ // flush the last, still open cluster
+ variant_cluster_by_chrid[chr_id]->push_back(vi_list);
+ }
+}
+
+int WholeGenome::ReadReferenceVariants(string filename){
+    return ReadWholeGenomeVariant(filename, /*flag=*/false); // baseline side: loader is called with flag == false
+}
+
+int WholeGenome::ReadQueryVariants(string filename){
+    return ReadWholeGenomeVariant(filename, /*flag=*/true); // query side: loader is called with flag == true
+}
+
+void WholeGenome::ReadRef(string genome_seq, string ref_vcf){
+    // Load the genome sequence, then the baseline VCF; remember the VCF's
+    // variant count and file name for later reporting.
+    ReadWholeGenomeSequence(genome_seq);
+    const int baseline_count = ReadReferenceVariants(ref_vcf);
+    baseline_variant_total_num = baseline_count; ref_vcf_filename = ref_vcf;
+}
+
+void WholeGenome::Compare(string query_vcf,
+    string output_prefix,
+    bool detail_results,
+    int score_unit_,
+    int match_mode_,
+    int score_scheme_)
+{
+    // Compare one query VCF with the baseline loaded by ReadRef. score_scheme_ == 3
+    // selects DirectMatch; an indicator of -1 expands to every value of that option.
+    score_unit_indicator = score_unit_;
+    match_mode_indicator = match_mode_;
+    score_scheme_indicator = score_scheme_;
+    // The indicators are stored first so this test (and DirectMatch) see this call's arguments, not stale members.
+    if(score_scheme_indicator == 3){
+        DirectMatch(ref_vcf_filename, query_vcf);
+        return;
+    }
+    que_vcf_filename = query_vcf;
+    this->output_prefix = output_prefix;
+    this->detail_results = detail_results;
+    query_variant_total_num = ReadQueryVariants(query_vcf);
+
+    // Start from empty option lists so a second Compare() on this object does not repeat modes.
+    score_unit_list.clear(); match_mode_list.clear(); score_scheme_list.clear(); mode_index_list.clear();
+    if(score_unit_indicator == -1){
+        score_unit_list.push_back(0);
+        score_unit_list.push_back(1);
+    }else{
+        score_unit_list.push_back(score_unit_indicator);
+    }
+
+    if(match_mode_indicator == -1){
+        match_mode_list.push_back(0);
+        match_mode_list.push_back(1);
+    }else{
+        match_mode_list.push_back(match_mode_indicator);
+    }
+
+    if(score_scheme_indicator == -1){
+        score_scheme_list.push_back(0);
+        score_scheme_list.push_back(1);
+        score_scheme_list.push_back(2);
+    }else{
+        score_scheme_list.push_back(score_scheme_indicator);
+    }
+
+    for(int i = 0; i < score_unit_list.size(); i++){
+        for(int j = 0; j < match_mode_list.size(); j++){
+            for(int k = 0; k < score_scheme_list.size(); k++){
+                int mode_index = GetIndexFromMatchScore(score_unit_list[i], match_mode_list[j], score_scheme_list[k]); // i walks score_unit_list, so that is the list to index
+                mode_index_list.push_back(mode_index); // so that I can directly know how many mode, do not need to calculate all the time
+            }
+        }
+    }
+
+    cout << "Baseline VCF: " << ref_vcf_filename << endl;
+    cout << "Query VCF: " << query_vcf << endl;
+    cout << "========VCF Stat.==========" << endl;
+    cout << "Total Number of VCF Entries: " << endl;
+    cout << "Baseline: " << baseline_variant_total_num << "; Query: " << query_variant_total_num << endl;
+
+    cout << "parallel clustering..." << endl;
+    ParallelClustering();
+
+    cout << "matching variants..." << endl;
+    ClusteringMatchMultiThread();
+
+    // most clustering results are cleared inside ParallelClustering function except the following one
+    // which is needed for matching
+    variants_by_cluster.clear();
+    // clean at the end of function
+    for(int j = 0; j < chrom_num; j++){
+        que_variant_by_chrid[j]->clear();
+        //delete que_variant_by_chrid[j];
+    }
+    //delete[] que_variant_by_chrid;
+
+    query_variant_strings.clear();
+    query_variant_total_num = 0;
+    threshold_list.clear();
+    threshold_num = 0;
+    // The following three matching results are cleared inside ClusteringMatchMultiThread function
+    // match_records_by_mode_by_thread;
+    // baseline_total_match_num;
+    // query_total_match_num;
+
+    return;
+}
+
+void WholeGenome::DirectMatch(string ref_vcf, string query_vcf)
+{
+    // Counts query variants that have an equal baseline variant at the same pos.
+    // Equality is operator== unless match_mode_indicator == 1, in which case
+    // CompareNoGenotype() decides. Only the counts are reported, through dout.
+    const int baseline_count = ReadReferenceVariants(ref_vcf);
+    const int query_count = ReadQueryVariants(query_vcf);
+    dout << baseline_count << "," << query_count << endl;
+
+    const bool ignore_genotype = (match_mode_indicator == 1);
+    int match_num = 0;
+    for(int chr = 0; chr < chrom_num; chr++){
+        vector<DiploidVariant> & baseline = *ref_variant_by_chrid[chr];
+        vector<DiploidVariant> & query = *que_variant_by_chrid[chr];
+        if(baseline.size() == 0 || query.size() == 0)
+            continue;
+
+        // baseline position -> index into `baseline`; positions may repeat
+        multimap<int, int> baseline_index_by_pos;
+        for(int b = 0; b < baseline.size(); b++){
+            baseline_index_by_pos.insert(pair<int, int>(baseline.at(b).pos, b));
+        }
+
+        for(int q = 0; q < query.size(); q++){
+            DiploidVariant var = query.at(q);
+            // a position that is absent from the index simply yields an empty range
+            pair<multimap<int, int>::iterator, multimap<int, int>::iterator> candidates =
+                baseline_index_by_pos.equal_range(var.pos);
+
+            for(multimap<int, int>::iterator it = candidates.first; it != candidates.second; ++it){
+                DiploidVariant ref_var = baseline.at(it->second);
+                const bool matched = ignore_genotype ? var.CompareNoGenotype(ref_var)
+                                                     : (var == ref_var);
+                if(matched){
+                    match_num ++;
+                    break; // each query variant is counted at most once
+                }
+            }
+        }
+    }
+    dout << "matched variants: " << match_num << endl;
+}
diff --git a/src/wholegenome_working.h b/src/wholegenome_working.h
new file mode 100644
index 0000000..41a7cce
--- /dev/null
+++ b/src/wholegenome_working.h
@@ -0,0 +1,292 @@
+#pragma once
+
+#include <unordered_map>
+#include <algorithm>
+#include <vector>
+#include <map>
+#include <list>
+#include <tuple>
+#include <chrono>
+#include <cstdint>
+#include <limits>
+#include <thread>
+#include <set>
+
+#include "util.h"
+#include "diploidvariant.h"
+//#include "tbb/task_scheduler_init.h"
+//#include "tbb/blocked_range.h"
+//#include "tbb/parallel_for.h"
+//#include "tbb/concurrent_vector.h"
+
+typedef struct VariantIndicator{
+    // Handle to one variant: var_id indexes the per-chromosome variant vector
+    // selected by chr_id; refer is true for the reference (baseline) set and
+    // false for the query set.
+    VariantIndicator(int chr_id_ = -1, int var_id_ = -1, bool refer_ = true) :
+        chr_id(chr_id_), var_id(var_id_), refer(refer_){}
+    // int, matching the constructor argument: a char would silently wrap
+    // chromosome ids above 127 (and cannot hold -1 where char is unsigned).
+    int chr_id;
+    int var_id;
+    bool refer;
+}VariantIndicator;
+
+typedef struct Interval {
+    // Plain pair of interval bounds; a default-constructed Interval is (0, 0).
+    int start, end;
+    Interval(int s, int e) : start(s), end(e) {}
+    Interval() : Interval(0, 0) {}
+}Interval;
+
+class SequencePath{
+public:
+    // State of one candidate path. n is stored as reference_length and sizes the
+    // four string_sequences tracks; everything else starts in its "nothing yet" state.
+    SequencePath(int n) :
+        reference_length(n),
+        current_genome_pos(-1),
+        score(0),
+        removable(false),
+        same_donor_len(false),
+        reached_sync_num(0)
+    {
+        for(int track = 0; track < 4; ++track){
+            string_sequences[track].assign(n, "."); // n entries, each the placeholder "."
+            donor_sequences[track].clear();
+        }
+        current_equal_donor_pos[0] = current_equal_donor_pos[1] = -1;
+    }
+    int reference_length;
+    vector<string> string_sequences[4];
+    map<int, pair<int, int>> choice_made[2]; // this can be used to indicate if choice is made and which choice
+    // one choice is a pair: variant id, phasing index
+    int current_genome_pos;
+    string donor_sequences[4];
+    int current_equal_donor_pos[2];
+    int score;
+    bool removable;
+    bool same_donor_len;
+    int reached_sync_num;
+};
+
+class WholeGenome{ // loads a genome plus baseline/query variants, clusters them and matches the clusters (entry points: ReadRef, Compare)
+private:
+ int chrom_num;
+ int thread_num; // number of workers used by ClusteringMatchMultiThread (the calling thread included)
+ string ref_vcf_filename;
+ string que_vcf_filename;
+ int baseline_variant_total_num; // set from ReadReferenceVariants() in ReadRef
+ int query_variant_total_num; // set from ReadQueryVariants() in Compare, reset to 0 when Compare finishes
+ vector<string> baseline_variant_strings;
+ vector<string> query_variant_strings;
+ bool detail_results;
+
+ //int thread_num; VCF->DiploidVariant->WholeGenome
+protected:
+ map<string, int> chrid_by_chrname;
+ map<int, string> chrname_by_chrid;
+ map<string, int> chrname_dict;
+ map<int, string> genome_sequences; // chromosome id -> sequence
+ vector<DiploidVariant> ** ref_variant_by_chrid; // indexed by chromosome id
+ vector<DiploidVariant> ** que_variant_by_chrid; // indexed by chromosome id
+ vector<vector<VariantIndicator>> ** variant_cluster_by_chrid;
+ // so here cluster is represented as vector<vector<VariantIndicator>>
+ // and we create a list of pointers point to cluster
+ // and we hold the point to that list
+
+ vector<vector<VariantIndicator>> variants_by_cluster;
+
+ vector<string> *** match_records_by_mode_by_thread; // [thread][mode index] -> records; allocated and freed in ClusteringMatchMultiThread
+ //vector<int> *** baseline_matches_by_mode_by_thread;
+ //vector<int> *** query_matches_by_mode_by_thread;
+ vector<int> *** baseline_total_match_num; // [thread][threshold index] -> per-mode counters
+ vector<int> *** query_total_match_num; // [thread][threshold index] -> per-mode counters
+
+ //map<float, int> *** tp_qual_num_by_mode_by_thread;
+ //map<float, int> *** fp_qual_num_by_mode_by_thread;
+
+ //map<float, int> query_total_qual_num;
+
+ string output_prefix;
+ string output_dir;
+ // copy the above into this.
+
+ int score_unit_indicator;
+ int match_mode_indicator;
+ int score_scheme_indicator;
+
+ vector<int> score_unit_list; // concrete option values expanded from the indicators in Compare (-1 means all values)
+ vector<int> match_mode_list;
+ vector<int> score_scheme_list;
+ vector<int> mode_index_list;
+
+ vector<double> threshold_list;
+ int threshold_num;
+
+ vector<float> per_list; // per_list[t] scales the query total reported for threshold t: total * (1 - per_list[t])
+
+ bool ReadWholeGenomeSequence(string filename);
+ bool ReadGenomeSequenceList(string filename);
+ int ReadWholeGenomeVariant(string filename, bool flag);
+ bool ReadVariantFileList(string filename);
+ int ReadReferenceVariants(string filename);
+ int ReadQueryVariants(string filename);
+ bool ParallelClustering(); // parallel by chr id
+ bool ParallelMatching(); // parallel by task
+ bool TBBMatching();
+
+ void SingleThreadClustering(int chr_id);
+ //bool MatchingSingleCluster(int cluster_index, int thread_index, int match_mode);
+
+ //override
+ bool ClusteringMatchInThread(int start, int end, int thread_index);
+ void ClusteringMatchMultiThread();
+ int NormalizeVariantSequence(int pos,
+ string & parsimonious_ref,
+ string & parsimonious_alt0,
+ string & parsimonious_alt1,
+ int chr_id);
+
+ struct compInterval {
+ bool operator()(const Interval &a, const Interval &b) const {
+ return a.start<b.start;
+ }
+ };
+
+ vector<Interval> merge(vector<Interval> &intervals) { // sorts the argument in place by start, then fuses intervals that overlap or touch
+ sort(intervals.begin(),intervals.end(),compInterval());
+ vector<Interval> results;
+ for(int i=0; i<intervals.size(); i++) {
+ if(results.empty() || results.back().end < intervals[i].start) // no overlap
+ results.push_back(intervals[i]);
+ else // overlap
+ results.back().end = max(results.back().end, intervals[i].end);
+ }
+ return results;
+ }
+
+ bool PathNeedDecision(SequencePath& sp, multimap<int, int> * choices_by_pos[], int pos);
+ int PathExtendOneStep(SequencePath& sp,
+ multimap<int, int> * choices_by_pos[],
+ const string & reference_sequence,
+ vector<int> & sync_points,
+ int match_mode);
+
+ bool PathMakeDecision(SequencePath& sp,
+ vector<DiploidVariant> & variant_list,
+ multimap<int, int> * choices_by_pos[],
+ list<SequencePath> & sequence_path_list,
+ const string & reference_sequence,
+ int score_unit,
+ int match_mode,
+ int score_scheme);
+
+ bool PathMakeDecisionBackup(SequencePath& sp,
+ vector<DiploidVariant> & variant_list,
+ multimap<int, int> * choices_by_pos[],
+ list<SequencePath> & sequence_path_list,
+ const string & reference_sequence,
+ int score_unit,
+ int match_mode,
+ int score_scheme);
+
+ bool MatchingSingleClusterBaseExtending(int cluster_index,
+ int thread_index,
+ vector<DiploidVariant> & variant_list,
+ string & subsequence,
+ int offset,
+ multimap<int, int> * choices_by_pos[],
+ vector<int> & sync_points,
+ int chr_id,
+ int score_unit,
+ int match_mode,
+ int score_scheme,
+ int threshold_index);
+
+ bool DonorLengthEqual(SequencePath & a, SequencePath & b);
+ void ConvergePaths(list<SequencePath> & path_list); // erases paths made redundant by a better-scoring equivalent
+ int CheckPathEqualProperty(SequencePath & sp, int match_mode);
+
+ int ScoreEditDistance(DiploidVariant & dv, int allele_indicator);
+ int EditDistance(const std::string& s1, const std::string& s2);
+ bool PathMakeDecisionNoGenotype(SequencePath& sp,
+ vector<DiploidVariant> & variant_list,
+ multimap<int, int> * choices_by_pos[],
+ list<SequencePath> & sequence_path_list,
+ const string & reference_sequence,
+ int score_unit,
+ int match_mode,
+ int score_scheme);
+
+ void ConstructMatchRecord(SequencePath & best_path,
+ vector<DiploidVariant> & variant_list,
+ string & subsequence,
+ int offset,
+ int thread_index,
+ int chr_id,
+ int mode_index,
+ int threshold_index);
+
+ void ConstructMatchRecordNoGenotype(SequencePath & best_path,
+ vector<DiploidVariant> & variant_list,
+ string & subsequence,
+ int offset,
+ int thread_index,
+ int chr_id,
+ int mode_index,
+ int threshold_index);
+
+ int CalculateScore(DiploidVariant & dv,
+ int choice,
+ int score_unit,
+ int match_mode,
+ int score_scheme);
+
+ int GetIndexFromMatchScore(int score_unit, int match_mode, int score_scheme);
+ bool ClearQuery();
+
+ inline void ToUpper(string & s){ // upper-cases s in place
+ transform(s.begin(), s.end(), s.begin(), ::toupper);
+ }
+
+ bool CheckTandemRepeat(string sequence, int unit_threshold);
+
+ bool MatchVariantListInThread(int thread_index,
+ int threshold_index,
+ int chr_id,
+ vector<DiploidVariant> & variant_list,
+ int cluster_id);
+
+ void initialize_score_matrix(int **score, char **trackBack, int M, int N);
+ int needleman_wunsch(string S1, string S2, string &R1, string &R2);
+ void GenerateAltVector(string ref, string alt, vector<string> & alt_vector);
+
+public:
+ WholeGenome(int thread_num_,
+ string output_dir_,
+ bool pr_curves_);
+
+ ~WholeGenome();
+
+ void ReadRef(string genome_seq,
+ string ref_vcf);
+
+ void Compare(string query_vcf,
+ string output_prefix,
+ bool detail_results,
+ int score_unit_,
+ int match_mode_,
+ int score_scheme_);
+
+ void DirectMatch(string ref_vcf,
+ string query_vcf);
+
+ int test(); // for direct test
+ void PrintPath(SequencePath & sp);
+
+ const static int MATCH_MODE_NUM = 16; // length of every per-thread, per-mode array
+ const static int VAR_LEN = 100;
+ const static int MAX_REPEAT_LEN = 1000; // separators longer than this skip the CheckTandemRepeat test in SingleThreadClustering
+ const static int ROC_SAMPLE_NUM = 5; // threshold slots allocated per thread for the match counters
+};
diff --git a/stat b/stat
new file mode 100644
index 0000000..c07befc
--- /dev/null
+++ b/stat
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+import sys
+from sys import argv
+import argparse
+
+citation = 'Please cite our paper'
+parser = argparse.ArgumentParser(epilog = citation)
+parser.add_argument('-s', metavar='simple.vcf', help='direct match vcf result')
+parser.add_argument('-c', metavar='complex.vcf', help='clustering match vcf result')
+# NOTE(review): parser.parse_args() is never called in this script, so `args` below is unbound.
+if(args.s is not None):
+ print('Analysis direct match vcf results')
+ with open(args.s) as simple_vcf:
+ for line in simple_vcf.readlines():
+ if line.startswith('#'): # header / comment lines are skipped
+ continue
+ columns = line.split('\t')
+ ref = # NOTE(review): unfinished statement - the right-hand side is missing, so the script cannot run as imported
+
diff --git a/vardiff b/vardiff
new file mode 100644
index 0000000..bc585f6
--- /dev/null
+++ b/vardiff
@@ -0,0 +1,299 @@
+#!/usr/bin/env python
+
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+ Authors:
+ Paul Medvedev(pashadag at cse.psu.edu)
+ Chen Sun(chensun at cse.psu.edu)
+"""
+
+import sys
+import textwrap as _textwrap
+import argparse
+import os
+
+versionError = "You are using an old version of python, please upgrade to python 2.7+\n"
+if sys.hexversion < 0x02070000: # 0x02070000 is the hexversion floor for Python 2.7
+ print (versionError)
+ exit()
+
+RUN = True # not referenced anywhere else in this script
+
+author_email = 'chensun at cse.psu.edu' # not referenced anywhere else in this script
+
+
+class SmartFormatter(argparse.HelpFormatter):
+ def _split_lines(self, text, width):
+ paragraphs = text.split('\n')
+ #return paragraphs
+ multiline_text = []
+ for paragraph in paragraphs:
+ formatted_paragraph = _textwrap.wrap(paragraph, width)
+ multiline_text = multiline_text + formatted_paragraph
+ return multiline_text
+
+ def _fill_text(self, text, width, indent):
+ return ''.join(indent + line for line in text.splitlines(True))
+
+citation = 'Please cite our paper.'
+
+parser = argparse.ArgumentParser(prog="vardiff", epilog = citation, formatter_class=lambda prog: SmartFormatter(prog,max_help_position=8)) # the lambda pins max_help_position=8
+parser.add_argument('match_files', nargs='+', metavar='File', help='.match file list')
+parser.add_argument('-o', '--output', help='output directory', metavar='DIRECTORY',default='./match_diff_output')
+args = parser.parse_args() # evaluated at module level; main() reads this global
+
+pos_donor_dict_list = [] # one dict per input file, appended by main()
+pos_content_dict_list = [] # one dict per input file, appended by main()
+
+html_head = """
+<html>
+<head>
+<style type="text/css">
+table, th, td {
+ border: 1px solid black;
+ border-collapse: collapse;
+}
+th, td {
+ padding: 5px;
+ text-align: left;
+}
+
+.separate_line{
+ padding: 0 20px 0;
+ margin: 20px 0;
+ line-height: 40px;
+ border-left: 200px solid #ddd;
+ border-right: 200px solid #ddd;
+ text-align: center;
+ font-size:100%;
+}
+
+
+div{
+ font-family: "Courier New", Courier, monospace !important;
+ font-size:100%;
+ line-height:0px;
+ white-space:nowrap;
+}
+
+.selected{
+ color: blue;
+}
+
+.discard{
+ color: gray;
+}
+
+.box{
+ border-left: 4px solid #ccc;
+ border-top: 4px solid #ccc;
+ border-bottom: 4px solid #ccc;
+ padding:20px;
+ margin-top: 10px;
+ margin-bottom: 100px;
+ margin-left:10px;
+}
+#advhelp{
+ display:none;
+}
+#advhelp:target{
+ display:block;
+}
+</style>
+</head>
+<body>
+"""
+# html_head and html_tail bracket the body of compare.html, which main() writes
+html_tail="""
+</body>
+</html>
+"""
+
+
+def read_match_file(filename):
+ pos_set = set()
+ pos_donor_dict = {}
+ pos_content_dict = {}
+ with open(filename) as file:
+ for line in file:
+ if line.startswith('#'):
+ continue
+ columns = line.split('\t')
+ if len(columns) < 2:
+ print line
+ pos = columns[0]+'_'+columns[1]
+ pos_set.add(pos)
+ donor = columns[3]
+ pos_donor_dict[pos] = donor
+ pos_content_dict[pos] = line
+ return pos_donor_dict, pos_content_dict, pos_set
+
+
+def levenshtein(s1, s2):
+ if len(s1) < len(s2):
+ return levenshtein(s2, s1)
+
+ # len(s1) >= len(s2)
+ if len(s2) == 0:
+ return len(s1)
+
+ previous_row = range(len(s2) + 1)
+ for i, c1 in enumerate(s1):
+ current_row = [i + 1]
+ for j, c2 in enumerate(s2):
+ insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
+ deletions = current_row[j] + 1 # than s2
+ substitutions = previous_row[j] + (c1 != c2)
+ current_row.append(min(insertions, deletions, substitutions))
+ previous_row = current_row
+
+ return previous_row[-1]
+
+
+def separate_string(s):
+ l = list(s)
+ return ' '.join(l)
+
+
+def parse_variant(offset, variant_string, phasing_string, is_baseline):
+ variant_content_list = ['<hr>\n']
+
+ variant_component = variant_string.split(',')
+ variant_pos = int(variant_component[0])
+ variant_prefix = ' '*(variant_pos - offset)
+ variant_ref = variant_component[1]
+ variant_alts = variant_component[2].split('/')
+ phasing_component = phasing_string.split('|')
+
+ if is_baseline:
+ if '0' in phasing_component:
+ variant_content_list.append('<pre class="selected">Baseline REF:' + separate_string(variant_prefix + variant_ref) + '</pre>\n')
+ else:
+ variant_content_list.append('<pre>Baseline REF:' + separate_string(variant_prefix + variant_ref) + '</pre>\n')
+ else:
+ if '0' in phasing_component:
+ variant_content_list.append('<pre class="selected">Query REF:' + separate_string(variant_prefix + variant_ref) + '</pre>\n')
+ else:
+ variant_content_list.append('<pre>Query REF:' + separate_string(variant_prefix + variant_ref) + '</pre>\n')
+
+ for i in range(2):
+ j = i
+ #print variant_alts, phasing_component
+ if len(variant_alts) < 2:
+ j = 0
+ if str(j+1) in phasing_component:
+ variant_content_list.append('<pre class="selected"> ALT:' + separate_string(variant_prefix + variant_alts[j]) + '</pre>\n')
+ else:
+ variant_content_list.append('<pre class="discard"> ALT:' + separate_string(variant_prefix + variant_alts[j]) + '</pre>\n')
+
+ return variant_content_list
+
+
+def parse_match(match_string, filename):
+ matching_content_list = ['<div class="separate_line" >' + filename +'</div>\n']
+
+ match_columns = match_string.split('\t')
+ offset = int(match_columns[1])
+ match_ref = match_columns[2]
+ match_donors = match_columns[3].split('/')
+ matching_content_list.append('<pre>Genome Ref:'+ separate_string(match_ref) +'</pre>\n')
+ matching_content_list.append('<pre> Donor 0:'+ separate_string(match_donors[0]) +'</pre>\n')
+ if len(match_donors) > 1:
+ matching_content_list.append('<pre> Donor 1:'+ separate_string(match_donors[1]) +'</pre>\n')
+ else:
+ matching_content_list.append('<pre> Donor 1:'+ separate_string(match_donors[0]) +'</pre>\n')
+
+ if match_columns[4] == '.':
+ return matching_content_list
+
+ for i in range(2):
+ matching_variants = match_columns[4+i].split(';')
+ matching_phasing = match_columns[6+i].split(';')
+ is_baseline = True
+ if i == 1:
+ is_baseline = False
+ for k in range(len(matching_variants)):
+ matching_content_list += parse_variant(offset, matching_variants[k], matching_phasing[k], is_baseline)
+
+ return matching_content_list
+
+
+def main():
+ if not os.path.exists(args.output):
+ os.mkdir(args.output)
+
+ match_file_list = args.match_files
+ match_file_num = len(match_file_list)
+ diff_filename_list = []
+ diff_content_table = [[] for i in range(match_file_num)]
+
+ union_pos_set = set()
+
+ for match_file in match_file_list:
+ (pos_donor_dict, pos_content_dict, pos_set) = read_match_file(match_file)
+ #print pos_set
+ pos_donor_dict_list.append(pos_donor_dict)
+ pos_content_dict_list.append(pos_content_dict)
+ union_pos_set.update(pos_set)
+ match_basename = os.path.basename(match_file)
+ diff_filename_list.append(args.output + '/' + match_basename + '.diff')
+
+ union_pos_list = list(union_pos_set)
+ union_pos_list.sort()
+
+ #print union_pos_list
+
+ # think specifically for two VCF files
+ # output detail
+ compare_filename = args.output + '/' + 'compare.html'
+ compare_file = open(compare_filename, 'w')
+ compare_file.write(html_head)
+
+ for pos in union_pos_list:
+ have_diff = False
+ for i in range(match_file_num):
+ if pos not in pos_donor_dict_list[i]:
+ have_diff = True
+ break
+ #print have_diff
+ if not have_diff:
+ donor_benchmark = pos_donor_dict_list[0][pos]
+ for i in range(match_file_num):
+ #print pos, donor_benchmark, pos_donor_dict[i][pos]
+ if pos_donor_dict_list[i][pos] != donor_benchmark:
+ have_diff = True
+ break
+
+ if have_diff:
+ compare_file.write('<div class="box">\n')
+ for i in range(match_file_num):
+ if pos in pos_content_dict_list[i]:
+ diff_content_table[i].append(pos_content_dict_list[i][pos])
+ for content in parse_match(pos_content_dict_list[i][pos], match_file_list[i]):
+ compare_file.write(content)
+ compare_file.write('</div>\n')
+
+ for i in range(match_file_num):
+ with open(diff_filename_list[i], 'w') as diff_file:
+ for diff_content in diff_content_table[i]:
+ diff_file.write(diff_content + '\n')
+
+ compare_file.write(html_tail)
+ compare_file.close()
+
+if __name__ == '__main__': # call main() only when the file is executed as a script
+ main()
diff --git a/varmatch b/varmatch
new file mode 100755
index 0000000..ab6757f
--- /dev/null
+++ b/varmatch
@@ -0,0 +1,587 @@
+#!/usr/bin/env python
+
+# Copyright 2015, Chen Sun
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+ Authors:
+ Chen Sun(chensun at cse.psu.edu)
+ Paul Medvedev(pashadag at cse.psu.edu)
+"""
+
import sys

# Refuse to run on interpreters older than 2.7: argparse and other features
# used below are not available there.  The message goes to stderr and the
# process exits with a non-zero status so calling scripts can detect the failure.
versionError = "You are using an old version of python, please upgrade to python 2.7+\n"
if sys.hexversion < 0x02070000:
    sys.stderr.write(versionError)
    sys.exit(1)
import textwrap as _textwrap
import multiprocessing
import argparse
import os
import subprocess
import time

# When False, shell_run() only echoes commands instead of executing them.
RUN = True

# Support contact printed in error messages.
author_email = 'chensun at cse.psu.edu'
+
class SmartFormatter(argparse.HelpFormatter):
    """Help formatter that keeps the explicit newlines written in help texts."""

    def _split_lines(self, text, width):
        """Wrap each newline-separated paragraph of ``text`` to ``width``.

        Paragraphs are wrapped independently, so a line break written in the
        help string always starts a new output line.  Empty paragraphs
        produce no output line because ``textwrap.wrap('')`` returns ``[]``.
        """
        wrapped_lines = []
        for paragraph in text.split('\n'):
            wrapped_lines.extend(_textwrap.wrap(paragraph, width))
        return wrapped_lines

    def _fill_text(self, text, width, indent):
        """Prefix every line of ``text`` with ``indent`` without re-wrapping."""
        indented = [indent + line for line in text.splitlines(True)]
        return ''.join(indented)
+
# Command-line interface.  Arguments are parsed at import time and the
# resulting ``args`` namespace is read by the functions below.
citation = 'Please cite our paper.'

parser = argparse.ArgumentParser(
    prog="varmatch",
    epilog=citation,
    formatter_class=lambda prog: SmartFormatter(prog, max_help_position=8))
parser.add_argument('-b', '--baseline', required=True, metavar='File',
                    help='baseline variant VCF filename')
# The rest of the script iterates over the query list unconditionally, so a
# missing -q is rejected here with a usage error instead of a later TypeError.
parser.add_argument('-q', '--query', nargs='+', required=True, metavar='File List',
                    help='query variant VCF filename')
parser.add_argument('-g', '--genome', required=True, metavar='File',
                    help='genome sequence FASTA filename')
parser.add_argument('-o', '--output', help='output directory', metavar='DIRECTORY',
                    default='./output')

thread_string = (
    "number of threads, default is the number of available cores (For this machine:"
    + str(multiprocessing.cpu_count())
    + ")\nIf larger than number of available cores or less than 1, automatically set to default value"
)

# Kept as a string because it is concatenated into the vm-core command line.
parser.add_argument('-t', '--thread', metavar="INT", help=thread_string,
                    default=str(multiprocessing.cpu_count()))

score_unit_string = (
    "scoring function/score unit: (Default: -1)\n"
    "-1 : iterate both 0 and 1.\n"
    "0 : the score that a VCF entry contributes is 1.\n"
    "1 : the score that a VCF entry contributes is the edit distance between the new allele and the reference one.\n"
)

match_mode_string = (
    "matching mode: (Default: -1)\n"
    "-1 : iterate both 0 and 1.\n"
    "0 : a set of query entries match a set of baseline entries if, "
    "for each entry, we can select one of the alleles such that the inferred sequences are identical\n"
    "1 : a set of query entries match a set of baseline entries if there exist a phasing of each set such that "
    "the two inferred haplotypes from the query are equal to the two inferred haplotypes from the baseline.\n"
)

score_scheme_string = (
    "scoring scheme: (Default: -1)\n"
    "-1 : iterate 0, 1, and 2 (excluding 3)\n"
    "0 : find two subsets of non-overlapping equivalent variants such that "
    "the score of the matched variants is maximized \n"
    "1 : find two subsets of non-overlapping equivalent variants such that"
    " the score of the chosen baseline variants is maximized\n"
    "2 : find a maximum scoring set of variants in the query such that"
    " each variant can be matched by a subset of the baseline variants\n"
    "3 : (1 to 1 direct match) find a maximum scoring set of entry pairs such that each entry pair contains"
    " one query and one baseline variant that result in the same sequence."
    " In this scheme, different scoring functions and "
    "matching mode have no difference.\n"
)

parser.add_argument('-u', '--score_unit', help=score_unit_string, metavar='[-1,0,1]', default=-1)

parser.add_argument('-m', '--match_mode', help=match_mode_string, metavar='[-1,0,1]', default=-1)

parser.add_argument('-s', '--score_scheme', help=score_scheme_string, metavar='[-1,0,1,2,3]', default=-1)

parser.add_argument('-G', '--no_graph', help='disable graphic module', action='store_true')

disable_curves_string = (
    "disable Precision-Recall curves, if use -G or --no_graph,"
    " then automatically disable these curves"
)

parser.add_argument('-C', '--disable_curves', help=disable_curves_string, action='store_true')

fast_mode_string = (
    "In this mode, automatically disable graphic module and precision-recall curves,"
    " only performs one matching criterion.\n"
    " Fast mode is equivalent to use following parameters compulsively: -G -u 0 -m 0 -s 0"
)

parser.add_argument('-f', '--fast_mode', help=fast_mode_string, action='store_true')

args = parser.parse_args()

# Fast mode overrides whatever was given for -G, -u, -m and -s.
if args.fast_mode:
    args.no_graph = True
    args.score_unit = 0
    args.match_mode = 0
    args.score_scheme = 0
+
+
def shell_run(command, hide=False):
    """Echo a shell command and, unless in dry-run mode, execute it.

    Args:
        command: Command line handed to the shell (``shell=True``).  Callers
            are responsible for quoting any file names embedded in it.
        hide: When true, the command's stdout and stderr are discarded.

    Returns:
        The exit status of the command, or ``None`` in dry-run mode
        (module-level ``RUN`` is false), where the command is only printed
        after a short pause.
    """
    if not RUN:
        # Dry run: pause, then show what would have been executed.
        time.sleep(3.5)
        print(command)
        return None

    print(command)
    if not hide:
        return subprocess.call(command, shell=True)

    # The context manager closes the devnull handle even if the call raises.
    with open(os.devnull, 'w') as devnull:
        return subprocess.call(command, shell=True, stdout=devnull, stderr=subprocess.STDOUT)
+
+
def check_command(command):
    """Return True if ``command`` names an existing program file.

    ``command`` is accepted when it is itself the path of an existing file.
    A bare name (no directory component) is additionally looked up in every
    directory listed in the ``PATH`` environment variable.

    Args:
        command: Program name or path.

    Returns:
        True when a matching regular file is found, otherwise False.  An
        unset or empty ``PATH`` simply yields False for bare names.
    """
    if os.path.isfile(command):
        return True

    # A name with a directory part is never resolved through PATH.
    if os.path.dirname(command):
        return False

    # os.pathsep keeps the lookup portable (':' on POSIX, ';' on Windows).
    for directory in os.environ.get('PATH', '').split(os.pathsep):
        if directory and os.path.isfile(os.path.join(directory, command)):
            return True
    return False
+
+
def table_2_html(table):
    """Render a list of rows as an HTML ``<table>`` string.

    Args:
        table: Sequence of rows, each a sequence of strings.  The first row
            is rendered with ``<th>`` cells, all following rows with ``<td>``.
            Cell text is inserted as-is (no HTML escaping).

    Returns:
        The HTML markup as a single string; an empty table yields an empty
        ``<table>`` element.
    """
    pieces = ['<table border=.5>']
    for row_index, row in enumerate(table):
        tag = 'th' if row_index == 0 else 'td'
        open_tag = '<' + tag + '>'
        close_tag = '</' + tag + '>'
        pieces.append('<tr>' + open_tag + (close_tag + open_tag).join(row) + close_tag + '</tr>')
    pieces.append('</table>')
    # Joining once avoids repeated string concatenation.
    return ''.join(pieces)
+
# Opening boilerplate shared by every generated report page: document start,
# table styling, the ".box" panel style and the opening <body> tag.
html_head = """
<html>
<head>
<style type="text/css">
table, th, td {
 border: 1px solid black;
 border-collapse: collapse;
}
th, td {
 padding: 5px;
 text-align: left;
}

.box{
 border: 4px solid #ccc;
 padding:20px;
 margin:10px 100px 100px 10px;
}
#advhelp{
 display:none;
}
#advhelp:target{
 display:block;
}
</style>
</head>
<body>
"""

# Closing boilerplate matching html_head.
html_tail = "\n</body>\n</html>\n"

# Matplotlib marker symbols, cycled through so each query gets its own marker.
marker_list = list('ov18sp*hxD')
+
def multiple_compare(baseline_file, query_list, genome_file):
    """Run the external comparison tool on one baseline and several queries.

    Builds the command line for the program named by the module-level
    ``compare_tool`` and executes it through ``shell_run``.  Every file name
    is shell-quoted so that paths containing spaces or shell metacharacters
    reach the tool intact.

    Args:
        baseline_file: Path of the baseline VCF file (passed as ``-b``).
        query_list: Iterable of query VCF paths (each passed as ``-q``).
        genome_file: Path of the genome FASTA file (passed as ``-g``).

    Exits the process with status 1 when the tool has not been verified yet
    (``check_compare_command`` is false) and cannot be found.
    """
    global check_compare_command
    global output_dir

    # shlex.quote exists on Python 3.3+; pipes.quote is the 2.7 equivalent.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    if not check_compare_command and not check_command(compare_tool):
        print('Error: can not find program: ' + compare_tool)
        print('\t Try "make" command before execute, or contact author for support: ' + author_email)
        sys.exit(1)
    # Remember that the tool was found so later calls skip the lookup.
    check_compare_command = True

    command_parts = [
        quote(compare_tool),
        '-b', quote(baseline_file),
        '-g', quote(genome_file),
        '-o', quote(output_dir),
    ]
    for query_file in query_list:
        command_parts += ['-q', quote(query_file)]

    # Forward the thread count only when it is a positive integer; anything
    # else is left out so the tool applies its own default.
    if args.thread is not None:
        try:
            thread_count = int(args.thread)
        except (TypeError, ValueError):
            thread_count = 0
        if thread_count > 0:
            command_parts += ['-t', str(thread_count)]

    command_parts += [
        '-u', str(args.score_unit),
        '-m', str(args.match_mode),
        '-s', str(args.score_scheme),
    ]

    # -C is forwarded whenever graphics or the curves themselves are disabled.
    if args.no_graph or args.disable_curves:
        command_parts.append('-C')

    shell_run(' '.join(command_parts))
+
+
def varmatch_pairwise(baseline_file, query_file, genome_file):
    """Return the output prefix used for one baseline/query pair.

    The prefix is ``<output_dir>/<baseline basename>_<query basename>``.
    ``genome_file`` is accepted but not used: the pairwise comparison call
    that would consume it is disabled in this version.
    """
    # pairwise_compare(baseline_file, query_file, genome_file) is disabled.
    pair_name = '_'.join([os.path.basename(baseline_file), os.path.basename(query_file)])
    return '%s/%s' % (output_dir, pair_name)
+
def create_table_prefx(score_unit, match_mode, score_scheme):
    """Translate the three matching parameters into report labels.

    Each parameter is compared against its string form ('0', '1', '2').

    Returns:
        ``[matching_id, score_unit_label, match_mode_label, score_scheme_label]``
        where ``matching_id`` concatenates one letter per parameter:
        U/E for the score unit, G/V for the match mode and T/B/Q for the
        score scheme.  A score scheme other than '0', '1' or '2' adds no
        letter and keeps the 'Total[T]' label.
    """
    if score_unit == '0':
        unit_letter, unit_label = 'U', 'Unit Cost[U]'
    else:
        unit_letter, unit_label = 'E', 'Edit Distance[E]'

    if match_mode == '0':
        mode_letter, mode_label = 'G', 'Genotype[G]'
    else:
        mode_letter, mode_label = 'V', 'Variant[V]'

    scheme_choices = (
        ('0', 'T', 'Total[T]'),
        ('1', 'B', 'Baseline[B]'),
        ('2', 'Q', 'Query[Q]'),
    )
    # Fallback for unrecognised schemes: no letter, default label.
    scheme_letter, scheme_label = '', 'Total[T]'
    for key, letter, label in scheme_choices:
        if score_scheme == key:
            scheme_letter, scheme_label = letter, label
            break

    return [unit_letter + mode_letter + scheme_letter, unit_label, mode_label, scheme_label]
+
+
def _percentage(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is zero."""
    whole = float(whole)
    if whole == 0:
        return 0.0
    return float(part) * 100 / whole


def parse_stat(output_prefix):
    """Parse ``<output_dir>/<output_prefix>.stat`` into report data.

    File layout as read here:
      * ``##Baseline:<n>`` sets the baseline variant count; any other
        ``##name:<n>`` line sets the query variant count.
      * Other lines starting with ``#`` and blank lines are ignored.
      * Data lines are tab-separated.  Columns 0-2 are score unit, match
        mode and score scheme; columns 4, 5 and 6 are comma-separated lists
        of baseline matches, query matches and query totals.
        NOTE(review): presumably one list entry per quality threshold --
        confirm against the tool that writes the file.

    Args:
        output_prefix: Stat file name without directory and ``.stat`` suffix.

    Returns:
        A tuple ``(baseline_num, query_num, x, y, z, no_filter_table,
        sensitivity_table, specificity_table)`` where ``x`` holds the
        matching ids, ``y``/``z`` the recall/precision of the first list
        entry of each data line, ``no_filter_table`` the header row followed
        by one formatted row per data line, and the two ``*_table`` values
        the full recall/precision lists per data line.  Percentages with a
        zero denominator are reported as 0.0.
    """
    global output_dir
    stat_filename = output_dir + '/' + output_prefix + '.stat'
    head = ['Matching Id', 'Score Unit', 'Match Mode', 'Score Scheme',
            'Baseline Match Number', 'Query Match Number', 'Recall(%)', 'Precision(%)']
    no_filter_table = [head]
    x = []  # matching id list
    y = []  # recall ("sensitivity") of the first entry per data line
    z = []  # precision ("specificity") of the first entry per data line

    sensitivity_table = []
    specificity_table = []

    baseline_num = 0.
    query_num = 0.
    with open(stat_filename) as stat_file:
        for line in stat_file:
            line = line.strip()
            if not line:
                # Blank lines carry no data.
                continue
            if line.startswith('##'):
                columns = line.split(':')
                if columns[0] == '##Baseline':
                    baseline_num = float(columns[1])
                else:
                    query_num = float(columns[1])
            if line.startswith('#'):
                continue

            temp = line.split('\t')
            row = create_table_prefx(temp[0], temp[1], temp[2])
            baseline_match_str_list = temp[4].split(',')
            query_match_str_list = temp[5].split(',')
            query_total_str_list = temp[6].split(',')

            # Recall: matched baseline variants over all baseline variants.
            sensitivity_list = [_percentage(matched, baseline_num)
                                for matched in baseline_match_str_list]
            # Precision: matched query variants over the query total at the
            # same list position.
            specificity_list = [_percentage(matched, query_total_str_list[i])
                                for i, matched in enumerate(query_match_str_list)]

            x.append(row[0])
            row += [baseline_match_str_list[0], query_match_str_list[0],
                    "%.3f" % sensitivity_list[0], "%.3f" % specificity_list[0]]

            y.append(sensitivity_list[0])
            z.append(specificity_list[0])

            sensitivity_table.append(sensitivity_list)
            specificity_table.append(specificity_list)
            no_filter_table.append(row)

    return baseline_num, query_num, x, y, z, no_filter_table, sensitivity_table, specificity_table
+
+
def create_table_by_matchingid_from_by_query(table_list, matching_list, query_number):
    """Regroup per-query result tables into one table per matching id.

    Args:
        table_list: One table per query.  Each table starts with a header
            row; data row ``k + 1`` belongs to ``matching_list[k]``.
        matching_list: Matching ids, in the order of the data rows.
        query_number: Unused; kept for interface compatibility.

    Returns:
        A list with one table per matching id.  Each table has a title row
        followed by one row per query holding that query's columns from
        index 4 onward (match numbers, recall, precision).
    """
    title = ['Query Id', 'Baseline Match Number', 'Query Match Number', 'Recall(%)', 'Precision(%)']
    table_by_matchingid = []
    for matching_index in range(len(matching_list)):
        matching_table = [list(title)]
        for table_index, query_table in enumerate(table_list):
            # +1 skips the header row at the top of every per-query table.
            raw_row = query_table[matching_index + 1]
            matching_table.append(['Query' + str(table_index + 1)] + raw_row[4:])
        table_by_matchingid.append(matching_table)
    return table_by_matchingid
+
+
+# all html and picture are created from stat file, not parameters
def _parameter_columns(table):
    """Return the first four columns (matching id and parameter labels) of each row."""
    return [row[:4] for row in table]


def _plot_metric_by_matching_id(plt, np, matching_ids, series_per_query, y_label, image_path):
    """Plot one percentage series per query against the matching ids and save it."""
    axes = plt.gca()
    axes.set_ylim([0, 100])
    positions = np.array(range(len(matching_ids)))
    for query_index, series in enumerate(series_per_query):
        marker_sign = marker_list[query_index % len(marker_list)]
        plt.xticks(positions, matching_ids)
        plt.plot(positions, series, marker=marker_sign, linestyle='-',
                 label='Query ' + str(query_index + 1))
    plt.xlabel('Matching Id')
    plt.ylabel(y_label)
    plt.legend(loc='best')
    plt.savefig(image_path)


def _apply_color_cycle(axes, colors):
    """Set the line colour cycle of ``axes`` on both old and new Matplotlib."""
    if hasattr(axes, 'set_prop_cycle'):
        axes.set_prop_cycle(color=colors)
    else:
        # Matplotlib older than 1.5 only offers set_color_cycle.
        axes.set_color_cycle(colors)


# all html and picture are created from stat file, not parameters
def create_stat_html(query_list, output_prefix_list):
    """Write the HTML reports (and their figures) into ``output_dir``.

    ``stat.html`` is always written.  ``sensitivity.png`` and
    ``specificity.png`` are produced unless ``args.no_graph`` is set, and
    ``precision_recall.html`` with its ``*.roc.png`` figures unless
    ``args.no_graph`` or ``args.disable_curves`` is set.

    Args:
        query_list: Query file names, used for labelling only.
        output_prefix_list: One stat-file prefix per query, each passed to
            ``parse_stat``.  If it is empty, ``stat.html`` receives only the
            page header and the function returns.
    """
    global output_dir
    exp_num = len(output_prefix_list)
    baseline_num_list = []
    query_num_list = []
    table_list = []
    label_list = []
    sensitivity_list = []
    specificity_list = []
    sensitivity_table_list = []
    specificity_table_list = []

    with open(output_dir + '/stat.html', 'w') as html_file:
        html_file.write(html_head)
        html_file.write('<h1>VarMatch Report</h1>')
        # "&ge;" keeps this source file pure ASCII.
        html_file.write('<p>precision and recall analysis for each query with variant quality &ge; 0</p>')

        for output_prefix in output_prefix_list:
            (baseline_num, query_num, x, y, z, table,
             sensitivity_table, specificity_table) = parse_stat(output_prefix)
            baseline_num_list.append(int(baseline_num))
            query_num_list.append(int(query_num))
            label_list.append(x)
            sensitivity_list.append(y)
            specificity_list.append(z)
            table_list.append(table)
            sensitivity_table_list.append(sensitivity_table)
            specificity_table_list.append(specificity_table)

        if not table_list:
            return

        if not args.no_graph:
            # Imported lazily so that -G works without numpy/matplotlib installed.
            import numpy as np
            import matplotlib
            matplotlib.use('Agg')
            import matplotlib.pyplot as plt

            _plot_metric_by_matching_id(plt, np, label_list[0], sensitivity_list,
                                        'Recall(%)', output_dir + '/sensitivity.png')
            plt.clf()  # clear figure for the next
            _plot_metric_by_matching_id(plt, np, label_list[0], specificity_list,
                                        'Precision(%)', output_dir + '/specificity.png')

        parameter_table = _parameter_columns(table_list[0])

        html_file.write('<h2>VarMatch Matching Parameters Table</h2>' + '\n')
        html_file.write(table_2_html(parameter_table))

        html_file.write('<h2>Sensitivity and Specificity of Queries under Different Matching Parameters</h2>' + '\n')
        html_file.write('<p> Baseline File: ' + args.baseline + '</p>' + '\n')
        for i in range(exp_num):
            html_file.write('<p> Query ' + str(i + 1) + ': ' + query_list[i] + '</p>' + '\n')
        html_file.write('<h3>Recall of Queries under Different Matching Parameters</h3>' + '\n')
        html_file.write('<img src="sensitivity.png" alt="Sensitivity Graph Not Found...">' + '\n')
        html_file.write('<h3>Precision of Queries under Different Matching Parameters</h3>' + '\n')
        html_file.write('<img src="specificity.png" alt="Specificity Graph Not Found...">' + '\n')

        # sensitivity and specificity analysis by query
        html_file.write('<h2>Sensitivity and Specificity Analysis by Query</h2>' + '\n')

        for i in range(exp_num):
            html_file.write('<div class="box">')
            html_file.write('<h3>Query File: ' + query_list[i] + '</h3>' + '\n')
            html_file.write('<p> Number of Variants in Baseline: ' + str(baseline_num_list[i]) + '</p>' + '\n')
            html_file.write('<p> Number of Variants in Query: ' + str(query_num_list[i]) + '</p>' + '\n')
            html_file.write(table_2_html(table_list[i]))
            html_file.write('</div>' + '\n')

        if exp_num > 1:
            # sensitivity and specificity analysis by matching id
            html_file.write('<h2>Sensitivity and Specificity Analysis by Matching Id</h2>')

            table_by_matchingid = create_table_by_matchingid_from_by_query(table_list, label_list[0], exp_num)
            for i in range(len(label_list[0])):
                html_file.write('<div class="box">')
                html_file.write('<h3>Matching Id: ' + label_list[0][i] + '</h3>' + '\n')
                html_file.write(table_2_html(table_by_matchingid[i]))
                html_file.write('</div>' + '\n')

        html_file.write(html_tail)

    # create roc html
    if args.no_graph or args.disable_curves:
        return

    with open(output_dir + '/precision_recall.html', 'w') as html_file:
        html_file.write(html_head)
        html_file.write('<h1>VarMatch Precision-Recall Curves</h1>')

        html_file.write('<h2>VarMatch Matching Parameters Table</h2>' + '\n')
        html_file.write(table_2_html(parameter_table))

        for i in range(exp_num):
            html_file.write('<p>Query ' + str(i + 1) + ': ' + query_list[i] + '</p>' + '\n')

        html_file.write('<h2>Precision-Recall Curve by Matching Id</h2>')
        html_file.write('<p>For each matching id, compare all queries in one graph</p>')
        # parameter_table[0] is the header row, hence the "- 1" / "i + 1".
        for i in range(len(parameter_table) - 1):
            html_file.write('<h3>Precision-Recall Curve for Parameter ' + parameter_table[i + 1][0] + '</h3>' + '\n')

            plt.clf()

            for j in range(exp_num):
                x = sensitivity_table_list[j][i]
                y = specificity_table_list[j][i]

                # The stored lists are rescaled from percent to fractions and
                # sorted IN PLACE; the per-query plots further down reuse
                # these same (already rescaled) lists.
                x[:] = [a / 100 for a in x]
                y[:] = [a / 100 for a in y]
                x.sort()
                y.sort(reverse=True)
                # NOTE(review): x and y are sorted independently and this plot
                # uses an ascending copy of y while the stored list stays
                # descending -- confirm the intended pairing of points.
                y = y[::-1]
                plt.plot(x, y, label='Query ' + str(j + 1))

            plt.xlim(0.0, 1.0)
            plt.ylim(0.0, 1.0)
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.legend(loc='best')
            plt.tight_layout()
            plt.savefig(output_dir + '/parameter' + str(i) + '.roc.png')

            html_file.write('<img src="parameter' + str(i) + '.roc.png' + '" alt="ROC Curve Not Found">\n')

        html_file.write('<h2>Precision-Recall Curve by Query</h2>')
        html_file.write('<p>For each query, compare all matching id in one graph</p>')
        for i in range(exp_num):
            html_file.write('<h3>Precision-Recall Curve for Query ' + str(i + 1) + '</h3>' + '\n')

            plt.clf()

            colormap = plt.cm.gist_ncar
            _apply_color_cycle(
                plt.gca(),
                [colormap(k) for k in np.linspace(0, 0.9, len(parameter_table))])

            for j in range(len(parameter_table) - 1):
                # Already fractions in [0, 1] thanks to the in-place
                # rescaling done in the per-matching-id loop above.
                x = sensitivity_table_list[i][j]
                y = specificity_table_list[i][j]
                plt.plot(x, y, label=parameter_table[j + 1][0])

            plt.xlim(0.0, 1.0)
            plt.ylim(0.0, 1.0)
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.legend(loc='best')
            plt.tight_layout()
            plt.savefig(output_dir + '/query' + str(i) + '.roc.png')

            html_file.write('<img src="query' + str(i) + '.roc.png' + '" alt="ROC Curve Not Found">\n')

        html_file.write(html_tail)
+
+
+
def main():
    """Entry point: run the comparison tool, then build the HTML reports.

    Sets the module-level globals the other functions rely on
    (``check_compare_command``, ``script_path``, ``compare_tool``,
    ``output_dir``, ``temp_dir``), creates the output directory if needed,
    runs ``multiple_compare`` on all queries and finally calls
    ``create_stat_html`` with one ``query<N>`` prefix per query file.
    """
    global check_compare_command
    global script_path
    global compare_tool
    global output_dir
    global temp_dir

    if len(sys.argv) < 2:
        parser.print_help()
        sys.exit()

    query_list = args.query
    if not query_list:
        # Everything below iterates over the queries; fail with a usage
        # message rather than a TypeError further down.
        parser.error('at least one query VCF file is required (-q/--query)')

    # False means "availability of the comparison tool not verified yet", so
    # multiple_compare() checks for it before the first run.
    check_compare_command = False

    script_path = sys.path[0]
    compare_tool = script_path + '/vm-core'

    # create output directory
    if args.output:
        output_dir = args.output
    else:
        output_dir = os.getcwd() + '/output'
    if not os.path.exists(output_dir):
        # makedirs also creates missing parent directories.
        os.makedirs(output_dir)

    temp_dir = output_dir + '/temp'

    multiple_compare(args.baseline, query_list, args.genome)

    output_prefix_list = ['query' + str(index + 1) for index in range(len(query_list))]

    create_stat_html(query_list, output_prefix_list)


if __name__ == '__main__':
    main()
diff --git a/xx.png b/xx.png
new file mode 100644
index 0000000..29fc8fa
Binary files /dev/null and b/xx.png differ
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/varmatch.git
More information about the debian-med-commit
mailing list