[med-svn] [varmatch] 01/02: Imported Upstream version 0+20160708+dfsg

Afif Elghraoui afif at moszumanska.debian.org
Tue Jul 12 08:22:03 UTC 2016

This is an automated email from the git hooks/post-receive script.

afif pushed a commit to branch master
in repository varmatch.

commit 81343a60519e37e9568e7522ead85a890c5f01bd
Author: Afif Elghraoui <afif at ghraoui.name>
Date:   Tue Jul 12 00:46:29 2016 -0700

    Imported Upstream version 0+20160708+dfsg
 .gitmodules                         |    3 +
 README.md                           |  132 ++
 drawsth.py                          |    9 +
 examples/chromosome_list.txt        |    5 +
 filter                              |  191 ++
 lib/__init__.py                     |    0
 lib/binary_search_tree.py           |  445 +++++
 lib/binary_tree.py                  |   74 +
 lib/linked_binary_tree.py           |  196 ++
 lib/linked_queue.py                 |   77 +
 lib/map_base.py                     |   38 +
 lib/red_black_tree.py               |  112 ++
 lib/tree.py                         |  151 ++
 license.txt                         |  674 +++++++
 makefile                            |   12 +
 purify                              |   71 +
 py/lib/__init__.py                  |    0
 py/lib/binary_search_tree.py        |  445 +++++
 py/lib/binary_tree.py               |   74 +
 py/lib/linked_binary_tree.py        |  196 ++
 py/lib/linked_queue.py              |   77 +
 py/lib/map_base.py                  |   38 +
 py/lib/red_black_tree.py            |  112 ++
 py/lib/tree.py                      |  151 ++
 py/vcfcompare.py                    | 1098 +++++++++++
 py/vcfcompare_backup.py             |  677 +++++++
 script/add_marker.py                |    0
 script/compare_match.py             |   44 +
 script/count_decomposed_matching.py |   28 +
 script/direct_match.py              |   32 +
 script/filter_hc.py                 |  120 ++
 script/filter_lcr.py                |  120 ++
 script/overlap.py                   |  147 ++
 script/overlap_direct.py            |  138 ++
 script/varmatch                     |  484 +++++
 src/diploid.cpp                     | 3562 +++++++++++++++++++++++++++++++++++
 src/diploid.h                       |  342 ++++
 src/diploidvariant.h                |  117 ++
 src/filter_cv.cpp                   |  245 +++
 src/filter_hc.cpp                   |  158 ++
 src/makefile                        |   20 +
 src/removeduplicate.cpp             |  456 +++++
 src/removeduplicate.h               |   31 +
 src/splitvcf.cpp                    |   30 +
 src/splitvcf.h                      |   15 +
 src/test.py                         |    1 +
 src/threadguard.cpp                 |    9 +
 src/threadguard.h                   |   17 +
 src/util.cpp                        |   20 +
 src/util.h                          |   54 +
 src/vcf.cpp                         | 1230 ++++++++++++
 src/vcf.h                           |  210 +++
 src/vm.cpp                          |  233 +++
 src/wholegenome.cpp                 | 3341 ++++++++++++++++++++++++++++++++
 src/wholegenome.h                   |  367 ++++
 src/wholegenome_backup.cpp          | 2056 ++++++++++++++++++++
 src/wholegenome_backup.h            |  274 +++
 src/wholegenome_working.cpp         | 2471 ++++++++++++++++++++++++
 src/wholegenome_working.h           |  292 +++
 stat                                |   19 +
 vardiff                             |  299 +++
 varmatch                            |  587 ++++++
 xx.png                              |  Bin 0 -> 27349 bytes
 63 files changed, 22627 insertions(+)

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..37dabe4
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "vt"]
+	path = vt
+	url = https://github.com/atks/vt.git
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8eebf9a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,132 @@
+# VarMatch
+robust matching of small variant datasets using flexible scoring schemes
+# Authors
+- Chen Sun (The Pennsylvania State University)
+- Paul Medvedev (The Pennsylvania State University)
+# Release Date
+### TBA
+If you have any questions about VarMatch, please email chensun at cse dot psu dot edu.
+If you identify a bug in VarMatch, please either report it on the VarMatch GitHub Issues page or email chensun at cse dot psu dot edu directly.
+# Prerequisite
+- GCC 4.7 or later for c++11 support
+- Python 2.7 or later
+- matplotlib*
+> *matplotlib is only used for graphical visualization. You can use the `-G` parameter to disable the visualization function.
+> *matplotlib is not a prerequisite if any of the `-f`, `-G` or `-C` parameters is used.
+# Installation
+**Quick Install Instruction:**
+You can build VarMatch from source. 
+git clone https://github.com/medvedevgroup/varmatch.git
+cd varmatch
+make all
+# Usage
+### Quick Usage:
+*compare two vcf files to match variants*
+./varmatch -b baseline.vcf -q query.vcf -g ref.fa -o out -f
+- `-b` baseline vcf file
+- `-q` query vcf file
+- `-g` genome fasta file
+- `-o` output file prefix, default value is `out`
+- `-f` fast mode*, equivalent to using the parameters `-u 0 -m 0 -s 0 -C`
+>*fast mode is suggested for ordinary analysis
+### Detail Usage
+./varmatch  -g <file> -b <file> -q <file> [-o <string>] [-t <int>] [-u <0|1>]
+     [-m <0|1>] [-s <0|1|2|3>] [-h] [-G] [-C] [-f]
+   `-g` <file>,  `--genome_sequence` <file>
+     (required)  genome sequence FASTA filename
+   `-b` <file>,  `--baseline` <file>
+     (required)  baseline variant VCF filename
+   `-q` <file>,  `--query` <file>
+     (required)  query variant VCF filename
+   `-o` <string>,  `--output_prefix` <string>
+     output filename prefix, default is "out"
+   `-t` <int>,  `--thread_num` <int>
+     number of threads, default is the number of available cores.
+     If larger than number of available cores or less than 1, automatically
+     set to default value
+   `-u` <0|1>,  `--score_unit` <0|1>
+     scoring function/score unit: (Default: 0)
+     0 : the score that a VCF entry contributes is 1.
+     1 : the score that a VCF entry contributes is the edit distance
+     between the new allele and the reference one.
+   `-m` <0|1>,  `--match_mode` <0|1>
+     matching mode: (Default: 0)
+     0 : a set of query entries match a set of baseline entries if, for
+     each entry, we can select one of the alleles such that the inferred
+     sequences are identical
+     1 : a set of query entries matches a set of baseline entries if there
+     exists a phasing of each set such that the two inferred haplotypes from
+     the query are equal to the two inferred haplotypes from the
+     baseline.
+   `-s` <0|1|2|3>,  `--score_scheme` <0|1|2|3>
+     scoring scheme: (Default: 0)
+     0 : find two subsets of non-overlapping equivalent variants such that
+     the score of the matched variants is maximized (Default)
+     1 : find two subsets of non-overlapping equivalent variants such that
+     the score of the chosen baseline variants is maximized
+     2 : find a maximum scoring set of variants in the query such that each
+     variant can be matched by a subset of the baseline variants
+     3 : (1 to 1 direct match) find a maximum scoring set of entry pairs
+     such that each entry pair contains one query and one baseline variant
+     that result in the same sequence. In this scheme, the choice of scoring
+     function and matching mode makes no difference.
+    `-G`, `--no_graph`
+          disable graphic module
+    `-C`, `--disable_curves`
+          disable Precision-Recall curves, if use -G or --no_graph, then
+          automatically disable these curves
+    `-f`, `--fast_mode`
+          In this mode, the graphic module and precision-recall curves are
+          automatically disabled, and only one matching criterion is used.
+          Fast mode is equivalent to forcing the following parameters: -G
+          -u 0 -m 0 -s 0
+### Help Information:
+use `-h/--help` for detailed help message.
diff --git a/drawsth.py b/drawsth.py
new file mode 100644
index 0000000..c4c2298
--- /dev/null
+++ b/drawsth.py
@@ -0,0 +1,9 @@
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+# Sample points 0.0, 0.2, ... up to (but not including) 5.0.
+t = np.arange(0., 5., 0.2)
+# Plot y = t/2; the 'g-^' format string selects a green solid line with
+# upward-triangle markers.
+plt.plot(t,t/2, 'g-^')
diff --git a/examples/chromosome_list.txt b/examples/chromosome_list.txt
new file mode 100644
index 0000000..6484ba2
--- /dev/null
+++ b/examples/chromosome_list.txt
@@ -0,0 +1,5 @@
+1   /home/varmatch/human/chr1.fa
+2   /home/varmatch/human/chr2.fa
+17  /home/varmatch/human/backup/chr17.fa
+X   /home/varmatch/human/chrxx.fa
+Y   /home/anotherpath/human/chrY/human.y.fa
\ No newline at end of file
diff --git a/filter b/filter
new file mode 100755
index 0000000..15f5a3f
--- /dev/null
+++ b/filter
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+from sys import argv
+import argparse
+import math
+import scipy.stats as stats
+citation = 'Please cite our paper'
+parser = argparse.ArgumentParser(epilog=citation)
+parser.add_argument('--qu', metavar='N', help='quality number(QUAL) threshold >= N (default: N=30)', default=30)
+parser.add_argument('--ab', metavar='N', help='allele balance(AB) threshold <= N%% (default: N=20)', default=20)
+parser.add_argument('--fs', metavar='N', help='Fisher strand P-vale <= N (default: N=0.001)', default=0.001)
+parser.add_argument('--rd', metavar='N', default=65,
+                    help="average read depth=N, maximum read depth(MD) threshold >= N+4*sqrt(N) (default: N=65),"
+                         " use --rd 0 to disable MD filter")
+parser.add_argument('-i', metavar='input.vcf', help='input VCF file')
+parser.add_argument('-o', metavar='output.vcf', help='output VCF file name(default: output.vcf)', default='output.vcf')
+parser.add_argument('--homo', action='store_true', help='filter out homozygous variants')
+parser.add_argument('--nf', action='store_true', help="no filters used in Heng Li review")
+parser.add_argument('--snp', action='store_true', help="only want SNPs")
+parser.add_argument('--indel', action='store_true', help='only want INDELs')
+args = parser.parse_args()
+def main():
+    if len(argv) < 2:
+        parser.print_help()
+        exit()
+    filter_homo = args.homo
+    if not filter_homo:
+        print ('Warning: compulsively filter out homozygous variants :)')
+        filter_homo = True
+    md = 0  # maximum depth filter
+    if args.rd != 0:
+        md = args.rd + 4 * math.sqrt(args.rd)
+    else:
+        print ('Warning: maximum depth(MD) filter is disabled because read depth = 0')
+    output_file = open(args.o, 'w')
+    with open(args.i) as input_file:
+        for line in input_file.readlines():
+            qu_fail = False
+            ab_fail = False
+            fs_fail = False
+            md_fail = False
+            if line.startswith('#'):
+                output_file.write(line)
+                continue
+            columns = line.split('\t')
+            if len(columns) < 8:
+                print ('Warning: current variant does not contains enough info for filtering')
+                continue
+            ab_contain = False
+            ab_pass = True
+            two_alleles = False
+            pv = 1.0
+            rd = -1
+            srf = -1
+            srr = -1
+            saf_list = []
+            sar_list = []
+            alt = columns[4]
+            if ',' in alt:
+                two_alleles = True
+            ref = columns[3]
+            is_indel = False
+            for a in alt.split(','):
+                if len(ref) != len(a):
+                    is_indel = True
+            if args.snp and is_indel:
+                continue
+            if args.indel and not is_indel:
+                continue
+            # Filter out homozygous
+            if filter_homo:
+                if len(columns) < 10:
+                    print('Warning: variant does not contain enough info to filter homozygous variants')
+                format_col = columns[8].split(':')
+                gt_index = -1
+                for i in range(len(format_col)):
+                    if format_col[i] == 'GT':
+                        gt_index = i
+                if gt_index == -1:
+                    print ('Warning: variant does not contain genotype info')
+                    continue
+                val_col = columns[9].split(':')
+                gt_val = val_col[gt_index]
+                gt_col = []
+                if '/' in gt_val:
+                    gt_col = gt_val.split('/')
+                elif '|' in gt_val:
+                    gt_col = gt_val.split('|')
+                else:
+                    print ('Warning: unrecognized genotype info')
+                    continue
+                if gt_col[0] == gt_col[1]:
+                    continue
+            if args.nf:
+                output_file.write(line)
+                continue
+            quality_num = float(columns[5])
+            # quality filter(QU)
+            if quality_num < args.qu:
+                qu_fail = True
+            if not qu_fail:
+                output_file.write(line)
+                continue
+            info_col = columns[7].split(';')
+            for info in info_col:
+                val_col = info.split('=')
+                info_name = val_col[0]
+                info_val = val_col[1]
+                if info_name == 'AB':
+                    ab_contain = True
+                    if two_alleles:
+                        ab_col = info_val.split(',')
+                        for ab in ab_col:
+                            if float(ab) > args.ab * 0.01:
+                                ab_pass = False
+                    else:
+                        if float(info_val) > args.ab * 0.01:
+                            ab_pass = False
+                elif info_name == 'DP':
+                    rd = int(info_val)
+                elif info_name == 'SRF':
+                    srf = int(info_val)
+                elif info_name == 'SRR':
+                    srr = int(info_val)
+                elif info_name == 'SAF':
+                    if two_alleles:
+                        temp_list = info_val.split(',')
+                        saf_list = [int(temp_list[0]), int(temp_list[1])]
+                    else:
+                        saf_list = [int(info_val)]
+                elif info_name == 'SAR':
+                    if two_alleles:
+                        temp_list = info_val.split(',')
+                        sar_list = [int(temp_list[0]), int(temp_list[1])]
+                    else:
+                        sar_list = [int(info_val)]
+            # AB filter
+            if not ab_contain or not ab_pass:
+                ab_fail = True
+            if not ab_fail:
+                output_file.write(line)
+                continue
+            # Maximum depth(MD) filter
+            if rd == -1:
+                print ('Warning: current variant does not contain read depth info')
+                continue
+            elif rd < md:
+                md_fail = True
+            if not md_fail:
+                output_file.write(line)
+                continue
+            # Fisher strand filter(FS)
+            oddsratio, pv = stats.fisher_exact([[srf, srr], [saf_list[0], sar_list[0]]])
+            if pv > args.fs:
+                fs_fail = True
+            if two_alleles:
+                oddsratio, pv = stats.fisher_exact([[srf, srr], [saf_list[1], sar_list[1]]])
+                if pv > args.fs:
+                    fs_fail = True
+            if not fs_fail:
+                output_file.write(line)
+                continue
+    output_file.close()
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/lib/__init__.py b/lib/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/lib/binary_search_tree.py b/lib/binary_search_tree.py
new file mode 100644
index 0000000..d5a4c17
--- /dev/null
+++ b/lib/binary_search_tree.py
@@ -0,0 +1,445 @@
+# Copyright 2015, Chen Sun
+# Based on source code copyright by 2013, Michael H. Goldwasser
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+from lib.linked_binary_tree import LinkedBinaryTree
+from lib.map_base import MapBase
+import copy
+class TreeMap(LinkedBinaryTree, MapBase):
+  """Sorted map implementation using a binary search tree."""
+  #---------------------------- override Position class ----------------------------
+  class Position(LinkedBinaryTree.Position):
+    def key(self):
+      """Return key of map's key-value pair."""
+      return self.element()._key
+    def value(self):
+      """Return value of map's key-value pair."""
+      return self.element()._value
+  #------------------------------- nonpublic utilities -------------------------------
+  def _subtree_search(self, p, k):
+    """Return Position of p's subtree having key k, or last node searched."""
+    #print(k)
+    if k == p.key():                                   # found match
+      return p                                         
+    elif k < p.key():                                  # search left subtree
+      if self.left(p) is not None:
+        return self._subtree_search(self.left(p), k)   
+    else:                                              # search right subtree
+      if self.right(p) is not None:
+        return self._subtree_search(self.right(p), k)
+    return p                                           # unsuccessful search
+  #create a subtree_search help function
+  def _search_trace(self, p, k):
+    """Return all the Position that has been searched."""
+    yield p
+    while p is not None and k != p.key():
+      if k < p.key():
+        p = self.left(p)
+        yield p
+      else:
+        p = self.right(p)
+        yield p
+  def _subtree_first_position(self, p):
+    """Return Position of first item in subtree rooted at p."""
+    walk = p
+    while self.left(walk) is not None:                 # keep walking left
+      walk = self.left(walk)
+    return walk
+  def _subtree_last_position(self, p):
+    """Return Position of last item in subtree rooted at p."""
+    walk = p
+    while self.right(walk) is not None:                # keep walking right
+      walk = self.right(walk)
+    return walk
+  #--------------------- public methods providing "positional" support ---------------------
+  def first(self):
+    """Return the first Position in the tree (or None if empty)."""
+    return self._subtree_first_position(self.root()) if len(self) > 0 else None
+  def last(self):
+    """Return the last Position in the tree (or None if empty)."""
+    return self._subtree_last_position(self.root()) if len(self) > 0 else None
+  def before(self, p):
+    """Return the Position just before p in the natural order.
+    Return None if p is the first position.
+    """
+    self._validate(p)                            # inherited from LinkedBinaryTree
+    if self.left(p):
+      return self._subtree_last_position(self.left(p))
+    else:
+      # walk upward
+      walk = p
+      above = self.parent(walk)
+      while above is not None and walk == self.left(above):
+        walk = above
+        above = self.parent(walk)
+      return above
+  def after(self, p):
+    """Return the Position just after p in the natural order.
+    Return None if p is the last position.
+    """
+    self._validate(p)                            # inherited from LinkedBinaryTree
+    if self.right(p):
+      return self._subtree_first_position(self.right(p))
+    else:
+      walk = p
+      above = self.parent(walk)
+      while above is not None and walk == self.right(above):
+        walk = above
+        above = self.parent(walk)
+      return above
+  def find_position(self, k):
+    """Return position with key k, or else neighbor (or None if empty)."""
+    if self.is_empty():
+      return None
+    else:
+      p = self._subtree_search(self.root(), k)
+      self._rebalance_access(p)                  # hook for balanced tree subclasses
+      return p
+  def find_nearest(self, k):
+    """Return position with key k, or else the nearest position k' (or None if empty)."""
+    if self.is_empty():
+      return None
+    else:
+      shortest_distance = 3000000000
+      nearest_p = None
+      for p in self._search_trace(self.root(), k):
+        if p is not None:
+          #print(p.key(), abs(p.key()-k), shortest_distance)
+          abs_distance = abs(p.key() - k)
+          if abs_distance < shortest_distance:
+            shortest_distance = abs_distance
+            nearest_p = p
+      self._rebalance_access(nearest_p)                  # hook for balanced tree subclasses
+      return nearest_p
+  def find_nearest_small(self, k):
+    """Return position with key k, or else the nearest position with k' < k (or None if empty)."""
+    if self.is_empty():
+      return None
+    else:
+      shortest_distance = 3000000000
+      nearest_p = None
+      for p in self._search_trace(self.root(), k):
+        if p is not None:
+          distance = k - p.key()
+          if distance >= 0 and distance < shortest_distance:
+            shortest_distance = distance
+            nearest_p = p     
+      self._rebalance_access(nearest_p)                  # hook for balanced tree subclasses
+      return nearest_p
+  def find_nearest_large(self, k):
+    """Return position with key k, or else the nearest position with k' > k (or None if empty)."""
+    if self.is_empty():
+      return None
+    else:
+      shortest_distance = 3000000000
+      nearest_p = None
+      for p in self._search_trace(self.root(), k):
+        if p is not None:
+          distance = p.key()-k
+          if distance >= 0 and distance < shortest_distance:
+            shortest_distance = distance
+            nearest_p = p     
+      self._rebalance_access(nearest_p)                  # hook for balanced tree subclasses
+      return nearest_p
+  def delete(self, p):
+    """Remove the item at given Position."""
+    self._validate(p)                            # inherited from LinkedBinaryTree
+    if self.left(p) and self.right(p):           # p has two children
+      replacement = self._subtree_last_position(self.left(p))
+      self._replace(p, replacement.element())    # from LinkedBinaryTree
+      p =  replacement
+    # now p has at most one child
+    parent = self.parent(p)
+    self._delete(p)                              # inherited from LinkedBinaryTree
+    self._rebalance_delete(parent)               # if root deleted, parent is None
+  def keys(self):
+    key_list = []
+    p = self.first()
+    while p is not None:
+      key_list.append(p.key())
+      p = self.after(p)
+    return key_list
+  #--------------------- public methods for (standard) map interface ---------------------
+  def __getitem__(self, k):
+    """Return value associated with key k (raise KeyError if not found)."""
+    if self.is_empty():
+      raise KeyError('Key Error: ' + repr(k))
+    else:
+      p = self._subtree_search(self.root(), k)
+      self._rebalance_access(p)                  # hook for balanced tree subclasses
+      if k != p.key():
+        raise KeyError('Key Error: ' + repr(k))
+      return p.value()
+  def __setitem__(self, k, v):
+    """Assign value v to key k, overwriting existing value if present."""
+    if self.is_empty():
+      leaf = self._add_root(self._Item(k,v))     # from LinkedBinaryTree
+    else:
+      p = self._subtree_search(self.root(), k)
+      if p.key() == k:
+        p.element()._value = v                   # replace existing item's value
+        self._rebalance_access(p)                # hook for balanced tree subclasses
+        return
+      else:
+        item = self._Item(k,v)
+        if p.key() < k:
+          leaf = self._add_right(p, item)        # inherited from LinkedBinaryTree
+        else:
+          leaf = self._add_left(p, item)         # inherited from LinkedBinaryTree
+    self._rebalance_insert(leaf)                 # hook for balanced tree subclasses
+  def __delitem__(self, k):
+    """Remove item associated with key k (raise KeyError if not found)."""
+    if not self.is_empty():
+      p = self._subtree_search(self.root(), k)
+      if k == p.key():
+        self.delete(p)                           # rely on positional version
+        return                                   # successful deletion complete
+      self._rebalance_access(p)                  # hook for balanced tree subclasses
+    raise KeyError('Key Error: ' + repr(k))
+  def __iter__(self):
+    """Generate an iteration of all keys in the map in order."""
+    p = self.first()
+    while p is not None:
+      yield p.key()
+      p = self.after(p)
+  #--------------------- public methods for sorted map interface ---------------------
+  def __reversed__(self):
+    """Generate an iteration of all keys in the map in reverse order."""
+    p = self.last()
+    while p is not None:
+      yield p.key()
+      p = self.before(p)
+  def find_min(self):
+    """Return (key,value) pair with minimum key (or None if empty)."""
+    if self.is_empty():
+      return None
+    else:
+      p = self.first()
+      return (p.key(), p.value())
+  def find_max(self):
+    """Return (key,value) pair with maximum key (or None if empty)."""
+    if self.is_empty():
+      return None
+    else:
+      p = self.last()
+      return (p.key(), p.value())
+  def find_le(self, k):
+    """Return (key,value) pair with greatest key less than or equal to k.
+    Return None if there does not exist such a key.
+    """
+    if self.is_empty():
+      return None
+    else:
+      p = self.find_position(k)
+      if k < p.key():
+        p = self.before(p)
+      return (p.key(), p.value()) if p is not None else None
+  def find_lt(self, k):
+    """Return (key,value) pair with greatest key strictly less than k.
+    Return None if there does not exist such a key.
+    """
+    if self.is_empty():
+      return None
+    else:
+      p = self.find_position(k)
+      if not p.key() < k:
+        p = self.before(p)
+      return (p.key(), p.value()) if p is not None else None
+  def find_ge(self, k):
+    """Return (key,value) pair with least key greater than or equal to k.
+    Return None if there does not exist such a key.
+    """
+    if self.is_empty():
+      return None
+    else:
+      p = self.find_position(k)                   # may not find exact match
+      if p.key() < k:                             # p's key is too small
+        p = self.after(p)
+      return (p.key(), p.value()) if p is not None else None
+  def find_gt(self, k):
+    """Return (key,value) pair with least key strictly greater than k.
+    Return None if there does not exist such a key.
+    """
+    if self.is_empty():
+      return None
+    else:
+      p = self.find_position(k)
+      if not k < p.key():                   
+        p = self.after(p)
+      return (p.key(), p.value()) if p is not None else None
+  def linear_range_search(self, position, start, stop):
+    """
+    Iterate all position such that start < position.key < stop
+    Mind: linear_search function only return Position, not key value pair.
+    If start is None, searching begins from self.first()
+    If start is None, iteration begins with minimum key of map.
+    If end is None, iteration continues through the maximum key of map.
+    """  
+    if not self.is_empty():
+      if position is not None:
+        p = position
+      else:
+        p = self.first()
+      while p is not None and (stop is None or p.key() < stop):
+        if p.key() >= start:
+          yield(p)
+        p = self.after(p)
+  def find_range(self, start, stop):
+    """Iterate all (key,value) pairs such that start <= key < stop.
+    If start is None, iteration begins with minimum key of map.
+    If stop is None, iteration continues through the maximum key of map.
+    """
+    if not self.is_empty():
+      if start is None:
+        p = self.first()
+      else:
+        # we initialize p with logic similar to find_ge
+        p = self.find_position(start)
+        if p.key() < start:
+          p = self.after(p)
+      while p is not None and (stop is None or p.key() < stop):
+        yield (p.key(), p.value())
+        p = self.after(p)
+  #--------------------- hooks used by subclasses to balance a tree ---------------------
+  def _rebalance_insert(self, p):
+    """Call to indicate that position p is newly added."""
+    pass
+  def _rebalance_delete(self, p):
+    """Call to indicate that a child of p has been removed."""
+    pass
+  def _rebalance_access(self, p):
+    """Call to indicate that position p was recently accessed."""
+    pass
+  #--------------------- nonpublic methods to support tree balancing ---------------------
+  def _relink(self, parent, child, make_left_child):
+    """Relink parent node with child node (we allow child to be None)."""
+    if make_left_child:                           # make it a left child
+      parent._left = child
+    else:                                         # make it a right child
+      parent._right = child
+    if child is not None:                         # make child point to parent
+      child._parent = parent
+  def _rotate(self, p):
+    """Rotate Position p above its parent.
+    Switches between these configurations, depending on whether p==a or p==b.
+          b                  a
+         / \                /  \
+        a  t2             t0   b
+       / \                     / \
+      t0  t1                  t1  t2
+    Caller should ensure that p is not the root.
+    """
+    """Rotate Position p above its parent."""
+    x = p._node
+    y = x._parent                                 # we assume this exists
+    z = y._parent                                 # grandparent (possibly None)
+    if z is None:            
+      self._root = x                              # x becomes root
+      x._parent = None        
+    else:
+      self._relink(z, x, y == z._left)            # x becomes a direct child of z
+    # now rotate x and y, including transfer of middle subtree
+    if x == y._left:
+      self._relink(y, x._right, True)             # x._right becomes left child of y
+      self._relink(x, y, False)                   # y becomes right child of x
+    else:
+      self._relink(y, x._left, False)             # x._left becomes right child of y
+      self._relink(x, y, True)                    # y becomes left child of x
+  def _restructure(self, x):
+    """Perform a trinode restructure among Position x, its parent, and its grandparent.
+    Return the Position that becomes root of the restructured subtree.
+    Assumes the nodes are in one of the following configurations:
+        z=a                 z=c           z=a               z=c  
+       /  \                /  \          /  \              /  \  
+      t0  y=b             y=b  t3       t0   y=c          y=a  t3 
+         /  \            /  \               /  \         /  \     
+        t1  x=c         x=a  t2            x=b  t3      t0   x=b    
+           /  \        /  \               /  \              /  \    
+          t2  t3      t0  t1             t1  t2            t1  t2   
+    The subtree will be restructured so that the node with key b becomes its root.
+              b
+            /   \
+          a       c
+         / \     / \
+        t0  t1  t2  t3
+    Caller should ensure that x has a grandparent.
+    """
+    """Perform trinode restructure of Position x with parent/grandparent."""
+    y = self.parent(x)
+    z = self.parent(y)
+    if (x == self.right(y)) == (y == self.right(z)):  # matching alignments
+      self._rotate(y)                                 # single rotation (of y)
+      return y                                        # y is new subtree root
+    else:                                             # opposite alignments
+      self._rotate(x)                                 # double rotation (of x)     
+      self._rotate(x)
+      return x                                        # x is new subtree root
diff --git a/lib/binary_tree.py b/lib/binary_tree.py
new file mode 100644
index 0000000..56ae9e7
--- /dev/null
+++ b/lib/binary_tree.py
@@ -0,0 +1,74 @@
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+from lib.tree import Tree
+class BinaryTree(Tree):
+  """Abstract base class representing a binary tree structure."""
+  # --------------------- additional abstract methods ---------------------
+  def left(self, p):
+    """Return a Position representing p's left child.
+    Return None if p does not have a left child.
+    """
+    raise NotImplementedError('must be implemented by subclass')
+  def right(self, p):
+    """Return a Position representing p's right child.
+    Return None if p does not have a right child.
+    """
+    raise NotImplementedError('must be implemented by subclass')
+  # ---------- concrete methods implemented in this class ----------
+  def sibling(self, p):
+    """Return a Position representing p's sibling (or None if no sibling)."""
+    parent = self.parent(p)
+    if parent is None:                    # p must be the root
+      return None                         # root has no sibling
+    else:
+      if p == self.left(parent):
+        return self.right(parent)         # possibly None
+      else:
+        return self.left(parent)          # possibly None
+  def children(self, p):
+    """Generate an iteration of Positions representing p's children."""
+    if self.left(p) is not None:
+      yield self.left(p)
+    if self.right(p) is not None:
+      yield self.right(p)
+  def inorder(self):
+    """Generate an inorder iteration of positions in the tree."""
+    if not self.is_empty():
+      for p in self._subtree_inorder(self.root()):
+        yield p
+  def _subtree_inorder(self, p):
+    """Generate an inorder iteration of positions in subtree rooted at p."""
+    if self.left(p) is not None:          # if left child exists, traverse its subtree
+      for other in self._subtree_inorder(self.left(p)):
+        yield other
+    yield p                               # visit p between its subtrees
+    if self.right(p) is not None:         # if right child exists, traverse its subtree
+      for other in self._subtree_inorder(self.right(p)):
+        yield other
+  # override inherited version to make inorder the default
+  def positions(self):
+    """Generate an iteration of the tree's positions."""
+    return self.inorder()                 # make inorder the default
diff --git a/lib/linked_binary_tree.py b/lib/linked_binary_tree.py
new file mode 100644
index 0000000..a6cc58b
--- /dev/null
+++ b/lib/linked_binary_tree.py
@@ -0,0 +1,196 @@
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+from lib.binary_tree import BinaryTree
+class LinkedBinaryTree(BinaryTree):
+  """Linked representation of a binary tree structure."""
+  #-------------------------- nested _Node class --------------------------
+  class _Node:
+    """Lightweight, nonpublic class for storing a node."""
+    __slots__ = '_element', '_parent', '_left', '_right' # streamline memory usage
+    def __init__(self, element, parent=None, left=None, right=None):
+      self._element = element
+      self._parent = parent
+      self._left = left
+      self._right = right
+  #-------------------------- nested Position class --------------------------
+  class Position(BinaryTree.Position):
+    """An abstraction representing the location of a single element."""
+    def __init__(self, container, node):
+      """Constructor should not be invoked by user."""
+      self._container = container
+      self._node = node
+    def element(self):
+      """Return the element stored at this Position."""
+      return self._node._element
+    def __eq__(self, other):
+      """Return True if other is a Position representing the same location."""
+      return type(other) is type(self) and other._node is self._node
+  #------------------------------- utility methods -------------------------------
+  def _validate(self, p):
+    """Return associated node, if position is valid."""
+    if not isinstance(p, self.Position):
+      raise TypeError('p must be proper Position type')
+    if p._container is not self:
+      raise ValueError('p does not belong to this container')
+    if p._node._parent is p._node:      # convention for deprecated nodes
+      raise ValueError('p is no longer valid')
+    return p._node
+  def _make_position(self, node):
+    """Return Position instance for given node (or None if no node)."""
+    return self.Position(self, node) if node is not None else None
+  #-------------------------- binary tree constructor --------------------------
+  def __init__(self):
+    """Create an initially empty binary tree."""
+    self._root = None
+    self._size = 0
+  #-------------------------- public accessors --------------------------
+  def __len__(self):
+    """Return the total number of elements in the tree."""
+    return self._size
+  def root(self):
+    """Return the root Position of the tree (or None if tree is empty)."""
+    return self._make_position(self._root)
+  def parent(self, p):
+    """Return the Position of p's parent (or None if p is root)."""
+    node = self._validate(p)
+    return self._make_position(node._parent)
+  def left(self, p):
+    """Return the Position of p's left child (or None if no left child)."""
+    node = self._validate(p)
+    return self._make_position(node._left)
+  def right(self, p):
+    """Return the Position of p's right child (or None if no right child)."""
+    node = self._validate(p)
+    return self._make_position(node._right)
+  def num_children(self, p):
+    """Return the number of children of Position p."""
+    node = self._validate(p)
+    count = 0
+    if node._left is not None:     # left child exists
+      count += 1
+    if node._right is not None:    # right child exists
+      count += 1
+    return count
+  #-------------------------- nonpublic mutators --------------------------
+  def _add_root(self, e):
+    """Place element e at the root of an empty tree and return new Position.
+    Raise ValueError if tree nonempty.
+    """
+    if self._root is not None:
+      raise ValueError('Root exists')
+    self._size = 1
+    self._root = self._Node(e)
+    return self._make_position(self._root)
+  def _add_left(self, p, e):
+    """Create a new left child for Position p, storing element e.
+    Return the Position of new node.
+    Raise ValueError if Position p is invalid or p already has a left child.
+    """
+    node = self._validate(p)
+    if node._left is not None:
+      raise ValueError('Left child exists')
+    self._size += 1
+    node._left = self._Node(e, node)                  # node is its parent
+    return self._make_position(node._left)
+  def _add_right(self, p, e):
+    """Create a new right child for Position p, storing element e.
+    Return the Position of new node.
+    Raise ValueError if Position p is invalid or p already has a right child.
+    """
+    node = self._validate(p)
+    if node._right is not None:
+      raise ValueError('Right child exists')
+    self._size += 1
+    node._right = self._Node(e, node)                 # node is its parent
+    return self._make_position(node._right)
+  def _replace(self, p, e):
+    """Replace the element at position p with e, and return old element."""
+    node = self._validate(p)
+    old = node._element
+    node._element = e
+    return old
+  def _delete(self, p):
+    """Delete the node at Position p, and replace it with its child, if any.
+    Return the element that had been stored at Position p.
+    Raise ValueError if Position p is invalid or p has two children.
+    """
+    node = self._validate(p)
+    if self.num_children(p) == 2:
+      raise ValueError('Position has two children')
+    child = node._left if node._left else node._right  # might be None
+    if child is not None:
+      child._parent = node._parent   # child's grandparent becomes parent
+    if node is self._root:
+      self._root = child             # child becomes root
+    else:
+      parent = node._parent
+      if node is parent._left:
+        parent._left = child
+      else:
+        parent._right = child
+    self._size -= 1
+    node._parent = node              # convention for deprecated node
+    return node._element
+  def _attach(self, p, t1, t2):
+    """Attach trees t1 and t2, respectively, as the left and right subtrees of the external Position p.
+    As a side effect, set t1 and t2 to empty.
+    Raise TypeError if trees t1 and t2 do not match type of this tree.
+    Raise ValueError if Position p is invalid or not external.
+    """
+    node = self._validate(p)
+    if not self.is_leaf(p):
+      raise ValueError('position must be leaf')
+    if not type(self) is type(t1) is type(t2):    # all 3 trees must be same type
+      raise TypeError('Tree types must match')
+    self._size += len(t1) + len(t2)
+    if not t1.is_empty():         # attached t1 as left subtree of node
+      t1._root._parent = node
+      node._left = t1._root
+      t1._root = None             # set t1 instance to empty
+      t1._size = 0
+    if not t2.is_empty():         # attached t2 as right subtree of node
+      t2._root._parent = node
+      node._right = t2._root
+      t2._root = None             # set t2 instance to empty
+      t2._size = 0
diff --git a/lib/linked_queue.py b/lib/linked_queue.py
new file mode 100644
index 0000000..978b35b
--- /dev/null
+++ b/lib/linked_queue.py
@@ -0,0 +1,77 @@
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#from ..exceptions import Empty
+class LinkedQueue:
+  """FIFO queue implementation using a singly linked list for storage."""
+  #-------------------------- nested _Node class --------------------------
+  class _Node:
+    """Lightweight, nonpublic class for storing a singly linked node."""
+    __slots__ = '_element', '_next'         # streamline memory usage
+    def __init__(self, element, next):
+      self._element = element
+      self._next = next
+  #------------------------------- queue methods -------------------------------
+  def __init__(self):
+    """Create an empty queue."""
+    self._head = None
+    self._tail = None
+    self._size = 0                          # number of queue elements
+  def __len__(self):
+    """Return the number of elements in the queue."""
+    return self._size
+  def is_empty(self):
+    """Return True if the queue is empty."""
+    return self._size == 0
+  def first(self):
+    """Return (but do not remove) the element at the front of the queue.
+    Raise Empty exception if the queue is empty.
+    """
+    if self.is_empty():
+      raise Empty('Queue is empty')
+    return self._head._element              # front aligned with head of list
+  def dequeue(self):
+    """Remove and return the first element of the queue (i.e., FIFO).
+    Raise Empty exception if the queue is empty.
+    """
+    if self.is_empty():
+      raise Empty('Queue is empty')
+    answer = self._head._element
+    self._head = self._head._next
+    self._size -= 1
+    if self.is_empty():                     # special case as queue is empty
+      self._tail = None                     # removed head had been the tail
+    return answer
+  def enqueue(self, e):
+    """Add an element to the back of queue."""
+    newest = self._Node(e, None)            # node will be new tail node
+    if self.is_empty():
+      self._head = newest                   # special case: previously empty
+    else:
+      self._tail._next = newest
+    self._tail = newest                     # update reference to tail node
+    self._size += 1
diff --git a/lib/map_base.py b/lib/map_base.py
new file mode 100644
index 0000000..a93c1f7
--- /dev/null
+++ b/lib/map_base.py
@@ -0,0 +1,38 @@
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+from collections import MutableMapping
+class MapBase(MutableMapping):
+  """Our own abstract base class that includes a nonpublic _Item class."""
+  #------------------------------- nested _Item class -------------------------------
+  class _Item:
+    """Lightweight composite to store key-value pairs as map items."""
+    __slots__ = '_key', '_value'
+    def __init__(self, k, v):
+      self._key = k
+      self._value = v
+    def __eq__(self, other):
+      return self._key == other._key   # compare items based on their keys
+    def __ne__(self, other):
+      return not (self == other)       # opposite of __eq__
+    def __lt__(self, other):
+      return self._key < other._key    # compare items based on their keys
diff --git a/lib/red_black_tree.py b/lib/red_black_tree.py
new file mode 100644
index 0000000..c5905c7
--- /dev/null
+++ b/lib/red_black_tree.py
@@ -0,0 +1,112 @@
+# Copyright 2015, Chen Sun
+# Based on source code copyright by 2013, Michael H. Goldwasser
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+from lib.binary_search_tree import TreeMap
+class RedBlackTreeMap(TreeMap):
+  """Sorted map implementation using a red-black tree."""
+  #-------------------------- nested _Node class --------------------------
+  class _Node(TreeMap._Node):
+    """Node class for red-black tree maintains bit that denotes color."""
+    __slots__ = '_red'     # add additional data member to the Node class
+    def __init__(self, element, parent=None, left=None, right=None):
+      TreeMap._Node.__init__(self, element, parent, left, right)
+      self._red = True     # new node red by default
+  #------------------------- positional-based utility methods -------------------------
+  # we consider a nonexistent child to be trivially black
+  def _set_red(self, p): p._node._red = True
+  def _set_black(self, p): p._node._red = False
+  def _set_color(self, p, make_red): p._node._red = make_red
+  def _is_red(self, p): return p is not None and p._node._red
+  def _is_red_leaf(self, p): return self._is_red(p) and self.is_leaf(p)
+  def _get_red_child(self, p):
+    """Return a red child of p (or None if no such child)."""
+    for child in (self.left(p), self.right(p)):
+      if self._is_red(child):
+        return child
+    return None
+  #------------------------- support for insertions -------------------------
+  def _rebalance_insert(self, p):
+    self._resolve_red(p)                         # new node is always red
+  def _resolve_red(self, p):
+    if self.is_root(p):
+      self._set_black(p)                         # make root black
+    else:
+      parent = self.parent(p)
+      if self._is_red(parent):                   # double red problem
+        uncle = self.sibling(parent)
+        if not self._is_red(uncle):              # Case 1: misshapen 4-node
+          middle = self._restructure(p)          # do trinode restructuring
+          self._set_black(middle)                # and then fix colors
+          self._set_red(self.left(middle))
+          self._set_red(self.right(middle))
+        else:                                    # Case 2: overfull 5-node
+          grand = self.parent(parent)
+          self._set_red(grand)                   # grandparent becomes red
+          self._set_black(self.left(grand))      # its children become black
+          self._set_black(self.right(grand))
+          self._resolve_red(grand)               # recur at red grandparent
+  #------------------------- support for deletions -------------------------
+  def _rebalance_delete(self, p):
+    if len(self) == 1:
+      self._set_black(self.root())  # special case: ensure that root is black
+    elif p is not None:
+      n = self.num_children(p)
+      if n == 1:                    # deficit exists unless child is a red leaf
+        c = next(self.children(p))
+        if not self._is_red_leaf(c):
+          self._fix_deficit(p, c)
+      elif n == 2:                  # removed black node with red child
+        if self._is_red_leaf(self.left(p)):
+          self._set_black(self.left(p))
+        else:
+          self._set_black(self.right(p))
+  def _fix_deficit(self, z, y):
+    """Resolve black deficit at z, where y is the root of z's heavier subtree."""
+    if not self._is_red(y): # y is black; will apply Case 1 or 2
+      x = self._get_red_child(y)
+      if x is not None: # Case 1: y is black and has red child x; do "transfer"
+        old_color = self._is_red(z)
+        middle = self._restructure(x)
+        self._set_color(middle, old_color)   # middle gets old color of z
+        self._set_black(self.left(middle))   # children become black
+        self._set_black(self.right(middle))
+      else: # Case 2: y is black, but no red children; recolor as "fusion"
+        self._set_red(y)
+        if self._is_red(z):
+          self._set_black(z)                 # this resolves the problem
+        elif not self.is_root(z):
+          self._fix_deficit(self.parent(z), self.sibling(z)) # recur upward
+    else: # Case 3: y is red; rotate misaligned 3-node and repeat
+      self._rotate(y)
+      self._set_black(y)
+      self._set_red(z)
+      if z == self.right(y):
+        self._fix_deficit(z, self.left(z))
+      else:
+        self._fix_deficit(z, self.right(z))
diff --git a/lib/tree.py b/lib/tree.py
new file mode 100644
index 0000000..921ba67
--- /dev/null
+++ b/lib/tree.py
@@ -0,0 +1,151 @@
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+from lib.linked_queue import LinkedQueue #LinkedQueue is only used for bfs
+import collections
+class Tree:
+  """Abstract base class representing a tree structure."""
+  #------------------------------- nested Position class -------------------------------
+  class Position:
+    """An abstraction representing the location of a single element within a tree.
+    Note that two position instances may represent the same inherent location in a tree.
+    Therefore, users should always rely on syntax 'p == q' rather than 'p is q' when testing
+    equivalence of positions.
+    We define a tree ADT using the concept of a position as an abstraction for a node of a tree.
+    """
+    def element(self):
+      """Return the element stored at this Position."""
+      raise NotImplementedError('must be implemented by subclass')
+    def __eq__(self, other):
+      """Return True if other Position represents the same location."""
+      raise NotImplementedError('must be implemented by subclass')
+    def __ne__(self, other):
+      """Return True if other does not represent the same location."""
+      return not (self == other)            # opposite of __eq__
+  # ---------- abstract methods that concrete subclass must support ----------
+  def root(self):
+    """Return Position representing the tree's root (or None if empty)."""
+    raise NotImplementedError('must be implemented by subclass')
+  def parent(self, p):
+    """Return Position representing p's parent (or None if p is root)."""
+    raise NotImplementedError('must be implemented by subclass')
+  def num_children(self, p):
+    """Return the number of children that Position p has."""
+    raise NotImplementedError('must be implemented by subclass')
+  def children(self, p):
+    """Generate an iteration of Positions representing p's children."""
+    raise NotImplementedError('must be implemented by subclass')
+  def __len__(self):
+    """Return the total number of elements in the tree."""
+    raise NotImplementedError('must be implemented by subclass')
+  # ---------- concrete methods implemented in this class ----------
+  def is_root(self, p):
+    """Return True if Position p represents the root of the tree."""
+    return self.root() == p
+  def is_leaf(self, p):
+    """Return True if Position p does not have any children."""
+    return self.num_children(p) == 0
+  def is_empty(self):
+    """Return True if the tree is empty."""
+    return len(self) == 0
+  def depth(self, p):
+    """Return the number of levels separating Position p from the root."""
+    if self.is_root(p):
+      return 0
+    else:
+      return 1 + self.depth(self.parent(p))
+  def _height1(self):                 # works, but O(n^2) worst-case time
+    """Return the height of the tree."""
+    return max(self.depth(p) for p in self.positions() if self.is_leaf(p))
+  def _height2(self, p):                  # time is linear in size of subtree
+    """Return the height of the subtree rooted at Position p."""
+    if self.is_leaf(p):
+      return 0
+    else:
+      return 1 + max(self._height2(c) for c in self.children(p))
+  def height(self, p=None):
+    """Return the height of the subtree rooted at Position p.
+    If p is None, return the height of the entire tree.
+    """
+    if p is None:
+      p = self.root()
+    return self._height2(p)        # start _height2 recursion
+  def __iter__(self):
+    """Generate an iteration of the tree's elements."""
+    for p in self.positions():                        # use same order as positions()
+      yield p.element()                               # but yield each element
+  def positions(self):
+    """Generate an iteration of the tree's positions."""
+    return self.preorder()                            # return entire preorder iteration
+  def preorder(self):
+    """Generate a preorder iteration of positions in the tree."""
+    if not self.is_empty():
+      for p in self._subtree_preorder(self.root()):  # start recursion
+        yield p
+  def _subtree_preorder(self, p):
+    """Generate a preorder iteration of positions in subtree rooted at p."""
+    yield p                                           # visit p before its subtrees
+    for c in self.children(p):                        # for each child c
+      for other in self._subtree_preorder(c):         # do preorder of c's subtree
+        yield other                                   # yielding each to our caller
+  def postorder(self):
+    """Generate a postorder iteration of positions in the tree."""
+    if not self.is_empty():
+      for p in self._subtree_postorder(self.root()):  # start recursion
+        yield p
+  def _subtree_postorder(self, p):
+    """Generate a postorder iteration of positions in subtree rooted at p."""
+    for c in self.children(p):                        # for each child c
+      for other in self._subtree_postorder(c):        # do postorder of c's subtree
+        yield other                                   # yielding each to our caller
+    yield p                                           # visit p after its subtrees
+  def breadthfirst(self):
+    """Generate a breadth-first iteration of the positions of the tree."""
+    if not self.is_empty():
+      fringe = LinkedQueue()             # known positions not yet yielded
+      fringe.enqueue(self.root())        # starting with the root
+      while not fringe.is_empty():
+        p = fringe.dequeue()             # remove from front of the queue
+        yield p                          # report this position
+        for c in self.children(p):
+          fringe.enqueue(c)              # add children to back of queue
diff --git a/license.txt b/license.txt
new file mode 100644
index 0000000..94a9ed0
--- /dev/null
+++ b/license.txt
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+                            Preamble
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+  The precise terms and conditions for copying, distribution and
+modification follow.
+                       TERMS AND CONDITIONS
+  0. Definitions.
+  "This License" refers to version 3 of the GNU General Public License.
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+  1. Source Code.
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+  The Corresponding Source for a work in source code form is that
+same work.
+  2. Basic Permissions.
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+  4. Conveying Verbatim Copies.
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+  5. Conveying Modified Source Versions.
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+  6. Conveying Non-Source Forms.
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+  7. Additional Terms.
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+  8. Termination.
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+  9. Acceptance Not Required for Having Copies.
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+  10. Automatic Licensing of Downstream Recipients.
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+  11. Patents.
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+  12. No Surrender of Others' Freedom.
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+  13. Use with the GNU Affero General Public License.
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+  14. Revised Versions of this License.
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
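+  15. Disclaimer of Warranty.
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.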
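+  16. Limitation of Liability.
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.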
+  17. Interpretation of Sections 15 and 16.
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+                     END OF TERMS AND CONDITIONS
+            How to Apply These Terms to Your New Programs
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+Also add information on how to contact you by electronic and paper mail.
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/makefile b/makefile
new file mode 100644
index 0000000..cc0e08a
--- /dev/null
+++ b/makefile
@@ -0,0 +1,12 @@
+# Default target: build everything.
+all: vm
+
+.PHONY: all vm clean
+
+# Run the build in src/ and mark the front-end scripts executable.
+vm:
+	$(MAKE) -C src all
+	chmod +x varmatch
+	chmod +x purify
+	chmod +x filter
+
+# Delegate cleanup to the makefile in src/.
+clean:
+	$(MAKE) -C src clean
diff --git a/purify b/purify
new file mode 100755
index 0000000..837cd28
--- /dev/null
+++ b/purify
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+from sys import argv
+import sys
+import argparse
+# Command-line interface. All three paths are needed by main(), so they are
+# declared required: a missing option is reported by argparse itself instead
+# of surfacing later as a TypeError on None.
+parser = argparse.ArgumentParser()
+parser.add_argument('-i', '--input', required=True, help='original vcf file')
+parser.add_argument('-o', '--output', required=True, help='purified vcf file')
+parser.add_argument('-g', '--genome', required=True, help='genome FASTA file')
+args = parser.parse_args()
+# Module-level aliases read by main().
+refFilename = args.genome
+vcfFilename = args.input
+purifyFilename = args.output
+def read_reference(refFilename):
+    """Return the sequence stored in the FASTA file refFilename as one string.
+    Lines starting with ">" (record headers) are skipped; every other line is
+    stripped of surrounding whitespace and appended. All records are joined
+    into a single string, so the result carries no per-record boundaries.
+    """
+    pieces = []
+    # The context manager closes the file even if reading fails part-way.
+    with open(refFilename) as refFile:
+        for line in refFile:                  # stream instead of readlines()
+            if line.startswith(">"):
+                continue
+            pieces.append(line.strip())
+    # One join avoids the quadratic cost of repeated string concatenation.
+    return ''.join(pieces)
+def main():
+    """Copy the input VCF to the output, dropping records whose REF mismatches.
+    Header lines (starting with "#") are copied unchanged. For every other
+    line the REF column is compared, case-insensitively, with the slice of the
+    sequence returned by read_reference() that starts at POS - 1; records
+    that disagree are counted and omitted. A summary is printed at the end.
+    Reads the module-level names args, vcfFilename, purifyFilename and
+    refFilename.
+    """
+    print ('\t[input] ' + args.input)
+    print ('\t[genome]' + args.genome)
+    num = 0                                   # records dropped so far
+    # Both files are closed by the context manager, also on error.
+    with open(vcfFilename) as vcfFile, open(purifyFilename, "w") as purifyFile:
+        reference = read_reference(refFilename)
+        for line in vcfFile:
+            if line.startswith("#"):
+                purifyFile.write(line)
+                continue
+            columns = line.split("\t")
+            pos = int(columns[1]) - 1         # convert POS to a 0-based string index
+            ref = columns[3].upper()
+            refSeq = reference[pos:pos + len(ref)].upper()
+            # The INFO column is deliberately not parsed: nothing here uses it,
+            # and indexing its third ";" field crashed on records with fewer fields.
+            if ref != refSeq:
+                num += 1
+                continue
+            purifyFile.write(line)
+    if(num == 0):
+        print ('\t all variants in input vcf file match genome sequence')
+    else:
+        print ('\t[Warning] ' + str(num) + ' variants do not match genome sequence and removed!')
+    print ('\t[output]' + args.output)
+if __name__ == '__main__':
+    main()
diff --git a/py/lib/__init__.py b/py/lib/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/py/lib/binary_search_tree.py b/py/lib/binary_search_tree.py
new file mode 100644
index 0000000..d5a4c17
--- /dev/null
+++ b/py/lib/binary_search_tree.py
@@ -0,0 +1,445 @@
+# Copyright 2015, Chen Sun
+# Based on source code copyright by 2013, Michael H. Goldwasser
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+from lib.linked_binary_tree import LinkedBinaryTree
+from lib.map_base import MapBase
+import copy
+class TreeMap(LinkedBinaryTree, MapBase):
+  """Sorted map implementation using a binary search tree.
+  Keys are compared with == and <. The find_nearest* methods additionally
+  subtract keys from each other, so they need keys that support "-".
+  """
+  #---------------------------- override Position class ----------------------------
+  class Position(LinkedBinaryTree.Position):
+    """Position whose element is a key-value item of the map."""
+    def key(self):
+      """Return key of map's key-value pair."""
+      return self.element()._key
+    def value(self):
+      """Return value of map's key-value pair."""
+      return self.element()._value
+  #------------------------------- nonpublic utilities -------------------------------
+  def _subtree_search(self, p, k):
+    """Return Position of p's subtree having key k, or last node searched.
+    Written as a loop rather than recursion so that a very deep tree cannot
+    exhaust the interpreter's recursion limit.
+    """
+    walk = p
+    while True:
+      if k == walk.key():                              # found match
+        return walk
+      if k < walk.key():                               # continue in left subtree
+        child = self.left(walk)
+      else:                                            # continue in right subtree
+        child = self.right(walk)
+      if child is None:                                # unsuccessful search
+        return walk
+      walk = child
+  def _search_trace(self, p, k):
+    """Generate every Position visited while searching p's subtree for key k.
+    The first item is p itself. When k is not found, the last item generated
+    is None (the missing child that ended the search), so callers must skip
+    None entries.
+    """
+    yield p
+    while p is not None and k != p.key():
+      p = self.left(p) if k < p.key() else self.right(p)
+      yield p
+  def _nearest_on_trace(self, k, distance):
+    """Return the Position on the search path for k with the smallest distance.
+    distance is a callable mapping a key to a number; Positions for which it
+    returns a negative value are not eligible. On ties the Position met first
+    on the path is kept. Returns None when no Position is eligible.
+    The best distance starts as None rather than a large constant, so no key
+    is excluded for being too far away.
+    """
+    best_distance = None
+    nearest_p = None
+    for p in self._search_trace(self.root(), k):
+      if p is None:                              # trailing None of an unsuccessful trace
+        continue
+      d = distance(p.key())
+      if d < 0:
+        continue
+      if best_distance is None or d < best_distance:
+        best_distance = d
+        nearest_p = p
+    if nearest_p is not None:                    # never hand None to the hook
+      self._rebalance_access(nearest_p)          # hook for balanced tree subclasses
+    return nearest_p
+  def _subtree_first_position(self, p):
+    """Return Position of first item in subtree rooted at p."""
+    walk = p
+    while self.left(walk) is not None:                 # keep walking left
+      walk = self.left(walk)
+    return walk
+  def _subtree_last_position(self, p):
+    """Return Position of last item in subtree rooted at p."""
+    walk = p
+    while self.right(walk) is not None:                # keep walking right
+      walk = self.right(walk)
+    return walk
+  #--------------------- public methods providing "positional" support ---------------------
+  def first(self):
+    """Return the first Position in the tree (or None if empty)."""
+    return self._subtree_first_position(self.root()) if len(self) > 0 else None
+  def last(self):
+    """Return the last Position in the tree (or None if empty)."""
+    return self._subtree_last_position(self.root()) if len(self) > 0 else None
+  def before(self, p):
+    """Return the Position just before p in the natural order.
+    Return None if p is the first position.
+    """
+    self._validate(p)                            # inherited from LinkedBinaryTree
+    if self.left(p) is not None:
+      return self._subtree_last_position(self.left(p))
+    # otherwise walk upward until we leave a right subtree
+    walk = p
+    above = self.parent(walk)
+    while above is not None and walk == self.left(above):
+      walk = above
+      above = self.parent(walk)
+    return above
+  def after(self, p):
+    """Return the Position just after p in the natural order.
+    Return None if p is the last position.
+    """
+    self._validate(p)                            # inherited from LinkedBinaryTree
+    if self.right(p) is not None:
+      return self._subtree_first_position(self.right(p))
+    # otherwise walk upward until we leave a left subtree
+    walk = p
+    above = self.parent(walk)
+    while above is not None and walk == self.right(above):
+      walk = above
+      above = self.parent(walk)
+    return above
+  def find_position(self, k):
+    """Return position with key k, or else neighbor (or None if empty)."""
+    if self.is_empty():
+      return None
+    p = self._subtree_search(self.root(), k)
+    self._rebalance_access(p)                    # hook for balanced tree subclasses
+    return p
+  def find_nearest(self, k):
+    """Return position with key k, or else the position whose key is closest to k.
+    Return None if the map is empty. Only Positions on the search path for k
+    are examined.
+    """
+    if self.is_empty():
+      return None
+    return self._nearest_on_trace(k, lambda key: abs(key - k))
+  def find_nearest_small(self, k):
+    """Return position with the greatest key k' <= k found on the search path.
+    Return None if the map is empty or no such key exists.
+    """
+    if self.is_empty():
+      return None
+    return self._nearest_on_trace(k, lambda key: k - key)
+  def find_nearest_large(self, k):
+    """Return position with the least key k' >= k found on the search path.
+    Return None if the map is empty or no such key exists.
+    """
+    if self.is_empty():
+      return None
+    return self._nearest_on_trace(k, lambda key: key - k)
+  def delete(self, p):
+    """Remove the item at given Position."""
+    self._validate(p)                            # inherited from LinkedBinaryTree
+    if self.left(p) is not None and self.right(p) is not None:   # p has two children
+      replacement = self._subtree_last_position(self.left(p))
+      self._replace(p, replacement.element())    # from LinkedBinaryTree
+      p = replacement
+    # now p has at most one child
+    parent = self.parent(p)
+    self._delete(p)                              # inherited from LinkedBinaryTree
+    self._rebalance_delete(parent)               # if root deleted, parent is None
+  def keys(self):
+    """Return a list of all keys of the map in increasing order."""
+    key_list = []
+    p = self.first()
+    while p is not None:
+      key_list.append(p.key())
+      p = self.after(p)
+    return key_list
+  #--------------------- public methods for (standard) map interface ---------------------
+  def __getitem__(self, k):
+    """Return value associated with key k (raise KeyError if not found)."""
+    if self.is_empty():
+      raise KeyError('Key Error: ' + repr(k))
+    p = self._subtree_search(self.root(), k)
+    self._rebalance_access(p)                    # hook for balanced tree subclasses
+    if k != p.key():
+      raise KeyError('Key Error: ' + repr(k))
+    return p.value()
+  def __setitem__(self, k, v):
+    """Assign value v to key k, overwriting existing value if present."""
+    if self.is_empty():
+      leaf = self._add_root(self._Item(k,v))     # from LinkedBinaryTree
+    else:
+      p = self._subtree_search(self.root(), k)
+      if p.key() == k:
+        p.element()._value = v                   # replace existing item's value
+        self._rebalance_access(p)                # hook for balanced tree subclasses
+        return
+      item = self._Item(k,v)
+      if p.key() < k:
+        leaf = self._add_right(p, item)          # inherited from LinkedBinaryTree
+      else:
+        leaf = self._add_left(p, item)           # inherited from LinkedBinaryTree
+    self._rebalance_insert(leaf)                 # hook for balanced tree subclasses
+  def __delitem__(self, k):
+    """Remove item associated with key k (raise KeyError if not found)."""
+    if not self.is_empty():
+      p = self._subtree_search(self.root(), k)
+      if k == p.key():
+        self.delete(p)                           # rely on positional version
+        return                                   # successful deletion complete
+      self._rebalance_access(p)                  # hook for balanced tree subclasses
+    raise KeyError('Key Error: ' + repr(k))
+  def __iter__(self):
+    """Generate an iteration of all keys in the map in order."""
+    p = self.first()
+    while p is not None:
+      yield p.key()
+      p = self.after(p)
+  #--------------------- public methods for sorted map interface ---------------------
+  def __reversed__(self):
+    """Generate an iteration of all keys in the map in reverse order."""
+    p = self.last()
+    while p is not None:
+      yield p.key()
+      p = self.before(p)
+  def find_min(self):
+    """Return (key,value) pair with minimum key (or None if empty)."""
+    if self.is_empty():
+      return None
+    p = self.first()
+    return (p.key(), p.value())
+  def find_max(self):
+    """Return (key,value) pair with maximum key (or None if empty)."""
+    if self.is_empty():
+      return None
+    p = self.last()
+    return (p.key(), p.value())
+  def find_le(self, k):
+    """Return (key,value) pair with greatest key less than or equal to k.
+    Return None if there does not exist such a key.
+    """
+    if self.is_empty():
+      return None
+    p = self.find_position(k)
+    if k < p.key():                               # p's key is too large
+      p = self.before(p)
+    return (p.key(), p.value()) if p is not None else None
+  def find_lt(self, k):
+    """Return (key,value) pair with greatest key strictly less than k.
+    Return None if there does not exist such a key.
+    """
+    if self.is_empty():
+      return None
+    p = self.find_position(k)
+    if not p.key() < k:                           # p's key is k or larger
+      p = self.before(p)
+    return (p.key(), p.value()) if p is not None else None
+  def find_ge(self, k):
+    """Return (key,value) pair with least key greater than or equal to k.
+    Return None if there does not exist such a key.
+    """
+    if self.is_empty():
+      return None
+    p = self.find_position(k)                     # may not find exact match
+    if p.key() < k:                               # p's key is too small
+      p = self.after(p)
+    return (p.key(), p.value()) if p is not None else None
+  def find_gt(self, k):
+    """Return (key,value) pair with least key strictly greater than k.
+    Return None if there does not exist such a key.
+    """
+    if self.is_empty():
+      return None
+    p = self.find_position(k)
+    if not k < p.key():                           # p's key is k or smaller
+      p = self.after(p)
+    return (p.key(), p.value()) if p is not None else None
+  def linear_range_search(self, position, start, stop):
+    """Generate every Position p with start <= p.key() < stop, walking in key order.
+    Unlike find_range this yields Positions, not (key,value) pairs.
+    The walk begins at position, or at self.first() when position is None,
+    and moves forward with after(); nothing before that point is visited.
+    If start is None, no lower bound is applied.
+    If stop is None, iteration continues through the maximum key of map.
+    """
+    if not self.is_empty():
+      p = position if position is not None else self.first()
+      while p is not None and (stop is None or p.key() < stop):
+        if start is None or p.key() >= start:     # never compare a key with None
+          yield p
+        p = self.after(p)
+  def find_range(self, start, stop):
+    """Iterate all (key,value) pairs such that start <= key < stop.
+    If start is None, iteration begins with minimum key of map.
+    If stop is None, iteration continues through the maximum key of map.
+    """
+    if not self.is_empty():
+      if start is None:
+        p = self.first()
+      else:
+        # we initialize p with logic similar to find_ge
+        p = self.find_position(start)
+        if p.key() < start:
+          p = self.after(p)
+      while p is not None and (stop is None or p.key() < stop):
+        yield (p.key(), p.value())
+        p = self.after(p)
+  #--------------------- hooks used by subclasses to balance a tree ---------------------
+  def _rebalance_insert(self, p):
+    """Call to indicate that position p is newly added."""
+    pass
+  def _rebalance_delete(self, p):
+    """Call to indicate that a child of p has been removed."""
+    pass
+  def _rebalance_access(self, p):
+    """Call to indicate that position p was recently accessed."""
+    pass
+  #--------------------- nonpublic methods to support tree balancing ---------------------
+  def _relink(self, parent, child, make_left_child):
+    """Relink parent node with child node (we allow child to be None)."""
+    if make_left_child:                           # make it a left child
+      parent._left = child
+    else:                                         # make it a right child
+      parent._right = child
+    if child is not None:                         # make child point to parent
+      child._parent = parent
+  def _rotate(self, p):
+    r"""Rotate Position p above its parent.
+    Switches between these configurations, depending on whether p==a or p==b.
+          b                  a
+         / \                /  \
+        a  t2             t0   b
+       / \                     / \
+      t0  t1                  t1  t2
+    Caller should ensure that p is not the root.
+    """
+    x = p._node
+    y = x._parent                                 # we assume this exists
+    z = y._parent                                 # grandparent (possibly None)
+    if z is None:
+      self._root = x                              # x becomes root
+      x._parent = None
+    else:
+      self._relink(z, x, y == z._left)            # x becomes a direct child of z
+    # now rotate x and y, including transfer of middle subtree
+    if x == y._left:
+      self._relink(y, x._right, True)             # x._right becomes left child of y
+      self._relink(x, y, False)                   # y becomes right child of x
+    else:
+      self._relink(y, x._left, False)             # x._left becomes right child of y
+      self._relink(x, y, True)                    # y becomes left child of x
+  def _restructure(self, x):
+    r"""Perform a trinode restructure among Position x, its parent, and its grandparent.
+    Return the Position that becomes root of the restructured subtree.
+    Assumes the nodes are in one of the following configurations:
+        z=a                 z=c           z=a               z=c
+       /  \                /  \          /  \              /  \
+      t0  y=b             y=b  t3       t0   y=c          y=a  t3
+         /  \            /  \               /  \         /  \
+        t1  x=c         x=a  t2            x=b  t3      t0   x=b
+           /  \        /  \               /  \              /  \
+          t2  t3      t0  t1             t1  t2            t1  t2
+    The subtree will be restructured so that the node with key b becomes its root.
+              b
+            /   \
+          a       c
+         / \     / \
+        t0  t1  t2  t3
+    Caller should ensure that x has a grandparent.
+    """
+    y = self.parent(x)
+    z = self.parent(y)
+    if (x == self.right(y)) == (y == self.right(z)):  # matching alignments
+      self._rotate(y)                                 # single rotation (of y)
+      return y                                        # y is new subtree root
+    else:                                             # opposite alignments
+      self._rotate(x)                                 # double rotation (of x)
+      self._rotate(x)
+      return x                                        # x is new subtree root
diff --git a/py/lib/binary_tree.py b/py/lib/binary_tree.py
new file mode 100644
index 0000000..56ae9e7
--- /dev/null
+++ b/py/lib/binary_tree.py
@@ -0,0 +1,74 @@
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+from lib.tree import Tree
+class BinaryTree(Tree):
+  """Abstract base class for a tree whose nodes have a left and a right child."""
+  # --------------------- additional abstract methods ---------------------
+  def left(self, p):
+    """Return the Position of p's left child, or None when there is none.
+    Concrete subclasses must override this method.
+    """
+    raise NotImplementedError('must be implemented by subclass')
+  def right(self, p):
+    """Return the Position of p's right child, or None when there is none.
+    Concrete subclasses must override this method.
+    """
+    raise NotImplementedError('must be implemented by subclass')
+  # ---------- concrete methods implemented in this class ----------
+  def sibling(self, p):
+    """Return the Position of the other child of p's parent (None if absent)."""
+    above = self.parent(p)
+    if above is None:                     # only the root has no parent
+      return None
+    if p == self.left(above):             # p is the left child
+      return self.right(above)            # may be None
+    return self.left(above)               # may be None
+  def children(self, p):
+    """Generate p's existing children, left child first."""
+    for accessor in (self.left, self.right):
+      child = accessor(p)
+      if child is not None:
+        yield child
+  def inorder(self):
+    """Generate the tree's positions in inorder (nothing for an empty tree)."""
+    if self.is_empty():
+      return
+    for p in self._subtree_inorder(self.root()):
+      yield p
+  def _subtree_inorder(self, p):
+    """Generate the positions of the subtree rooted at p in inorder."""
+    left_child = self.left(p)
+    if left_child is not None:            # left subtree comes first
+      for other in self._subtree_inorder(left_child):
+        yield other
+    yield p                               # then p itself
+    right_child = self.right(p)
+    if right_child is not None:           # right subtree comes last
+      for other in self._subtree_inorder(right_child):
+        yield other
+  # override inherited version to make inorder the default
+  def positions(self):
+    """Generate the tree's positions, using inorder as the default order."""
+    return self.inorder()
diff --git a/py/lib/linked_binary_tree.py b/py/lib/linked_binary_tree.py
new file mode 100644
index 0000000..a6cc58b
--- /dev/null
+++ b/py/lib/linked_binary_tree.py
@@ -0,0 +1,196 @@
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+from lib.binary_tree import BinaryTree
+class LinkedBinaryTree(BinaryTree):
+  """Binary tree stored as nodes linked to their parent and children."""
+  #-------------------------- nested _Node class --------------------------
+  class _Node:
+    """Nonpublic record holding an element and its parent/left/right links."""
+    __slots__ = '_element', '_parent', '_left', '_right' # streamline memory usage
+    def __init__(self, element, parent=None, left=None, right=None):
+      self._element = element
+      self._parent = parent
+      self._left = left
+      self._right = right
+  #-------------------------- nested Position class --------------------------
+  class Position(BinaryTree.Position):
+    """Handle identifying one node of one particular tree."""
+    def __init__(self, container, node):
+      """Store the owning tree and the node; not meant to be called by users."""
+      self._container = container
+      self._node = node
+    def element(self):
+      """Return the element stored at this Position."""
+      return self._node._element
+    def __eq__(self, other):
+      """Return True if other has the same type and refers to the same node."""
+      if type(other) is not type(self):
+        return False
+      return other._node is self._node
+  #------------------------------- utility methods -------------------------------
+  def _validate(self, p):
+    """Return the node behind Position p after checking that p is usable.
+    Raise TypeError if p is not a Position of this class, ValueError if it
+    belongs to another tree or its node has been deleted.
+    """
+    if not isinstance(p, self.Position):
+      raise TypeError('p must be proper Position type')
+    if p._container is not self:
+      raise ValueError('p does not belong to this container')
+    target = p._node
+    if target._parent is target:        # deleted nodes are their own parent
+      raise ValueError('p is no longer valid')
+    return target
+  def _make_position(self, node):
+    """Wrap node in a Position of this tree; None stays None."""
+    if node is None:
+      return None
+    return self.Position(self, node)
+  #-------------------------- binary tree constructor --------------------------
+  def __init__(self):
+    """Create an initially empty binary tree."""
+    self._root = None
+    self._size = 0
+  #-------------------------- public accessors --------------------------
+  def __len__(self):
+    """Return the number of elements stored in the tree."""
+    return self._size
+  def root(self):
+    """Return the Position of the root (None for an empty tree)."""
+    return self._make_position(self._root)
+  def parent(self, p):
+    """Return the Position of p's parent (None when p is the root)."""
+    return self._make_position(self._validate(p)._parent)
+  def left(self, p):
+    """Return the Position of p's left child (None when absent)."""
+    return self._make_position(self._validate(p)._left)
+  def right(self, p):
+    """Return the Position of p's right child (None when absent)."""
+    return self._make_position(self._validate(p)._right)
+  def num_children(self, p):
+    """Return how many children (0, 1 or 2) Position p has."""
+    target = self._validate(p)
+    return (target._left is not None) + (target._right is not None)
+  #-------------------------- nonpublic mutators --------------------------
+  def _add_root(self, e):
+    """Store e as the root of an empty tree and return its Position.
+    Raise ValueError if the tree already has a root.
+    """
+    if self._root is not None:
+      raise ValueError('Root exists')
+    self._size = 1
+    self._root = self._Node(e)
+    return self._make_position(self._root)
+  def _add_left(self, p, e):
+    """Store e in a new left child of Position p and return its Position.
+    Raise ValueError if p is invalid or already has a left child.
+    """
+    target = self._validate(p)
+    if target._left is not None:
+      raise ValueError('Left child exists')
+    self._size += 1
+    fresh = self._Node(e, target)                     # target is the parent
+    target._left = fresh
+    return self._make_position(fresh)
+  def _add_right(self, p, e):
+    """Store e in a new right child of Position p and return its Position.
+    Raise ValueError if p is invalid or already has a right child.
+    """
+    target = self._validate(p)
+    if target._right is not None:
+      raise ValueError('Right child exists')
+    self._size += 1
+    fresh = self._Node(e, target)                     # target is the parent
+    target._right = fresh
+    return self._make_position(fresh)
+  def _replace(self, p, e):
+    """Store e at Position p and return the element it replaces."""
+    target = self._validate(p)
+    previous, target._element = target._element, e
+    return previous
+  def _delete(self, p):
+    """Remove the node at Position p, promoting its only child if it has one.
+    Return the element that was stored there.
+    Raise ValueError if p is invalid or has two children.
+    """
+    target = self._validate(p)
+    if self.num_children(p) == 2:
+      raise ValueError('Position has two children')
+    # at most one of the two links is set; pick it (may be None)
+    heir = target._left if target._left is not None else target._right
+    if heir is not None:
+      heir._parent = target._parent    # heir moves up one level
+    if target is self._root:
+      self._root = heir                # heir takes over as root
+    else:
+      above = target._parent
+      if target is above._left:
+        above._left = heir
+      else:
+        above._right = heir
+    self._size -= 1
+    target._parent = target            # self-parent marks the node as deleted
+    return target._element
+  def _attach(self, p, t1, t2):
+    """Hang trees t1 and t2 below leaf Position p as left and right subtrees.
+    Both t1 and t2 are emptied as a side effect.
+    Raise ValueError if p is invalid or not a leaf.
+    Raise TypeError if t1 or t2 is not of exactly this tree's type.
+    """
+    target = self._validate(p)
+    if not self.is_leaf(p):
+      raise ValueError('position must be leaf')
+    if type(t1) is not type(self) or type(t2) is not type(t1):   # all 3 must match
+      raise TypeError('Tree types must match')
+    self._size += len(t1) + len(t2)
+    if not t1.is_empty():         # move t1's nodes under target as left subtree
+      t1._root._parent = target
+      target._left = t1._root
+      t1._root = None             # t1 no longer owns any node
+      t1._size = 0
+    if not t2.is_empty():         # move t2's nodes under target as right subtree
+      t2._root._parent = target
+      target._right = t2._root
+      t2._root = None             # t2 no longer owns any node
+      t2._size = 0
diff --git a/py/lib/linked_queue.py b/py/lib/linked_queue.py
new file mode 100644
index 0000000..978b35b
--- /dev/null
+++ b/py/lib/linked_queue.py
@@ -0,0 +1,77 @@
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+# Defined here because nothing in this module imports it: without it,
+# first() and dequeue() on an empty queue would fail with NameError.
+class Empty(Exception):
+  """Error raised when accessing an element of an empty container."""
+  pass
+class LinkedQueue:
+  """FIFO queue implementation using a singly linked list for storage."""
+  #-------------------------- nested _Node class --------------------------
+  class _Node:
+    """Lightweight, nonpublic class for storing a singly linked node."""
+    __slots__ = '_element', '_next'         # streamline memory usage
+    def __init__(self, element, next):
+      self._element = element
+      self._next = next
+  #------------------------------- queue methods -------------------------------
+  def __init__(self):
+    """Create an empty queue."""
+    self._head = None                       # node holding the front element
+    self._tail = None                       # node holding the back element
+    self._size = 0                          # number of queue elements
+  def __len__(self):
+    """Return the number of elements in the queue."""
+    return self._size
+  def is_empty(self):
+    """Return True if the queue is empty."""
+    return self._size == 0
+  def first(self):
+    """Return (but do not remove) the element at the front of the queue.
+    Raise Empty exception if the queue is empty.
+    """
+    if self.is_empty():
+      raise Empty('Queue is empty')
+    return self._head._element              # front aligned with head of list
+  def dequeue(self):
+    """Remove and return the first element of the queue (i.e., FIFO).
+    Raise Empty exception if the queue is empty.
+    """
+    if self.is_empty():
+      raise Empty('Queue is empty')
+    answer = self._head._element
+    self._head = self._head._next
+    self._size -= 1
+    if self.is_empty():                     # special case as queue is empty
+      self._tail = None                     # removed head had been the tail
+    return answer
+  def enqueue(self, e):
+    """Add an element to the back of queue."""
+    newest = self._Node(e, None)            # node will be new tail node
+    if self.is_empty():
+      self._head = newest                   # special case: previously empty
+    else:
+      self._tail._next = newest
+    self._tail = newest                     # update reference to tail node
+    self._size += 1
diff --git a/py/lib/map_base.py b/py/lib/map_base.py
new file mode 100644
index 0000000..a93c1f7
--- /dev/null
+++ b/py/lib/map_base.py
@@ -0,0 +1,38 @@
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+from collections import MutableMapping
+class MapBase(MutableMapping):
+  """Abstract map base class that supplies a nonpublic _Item record type."""
+  #------------------------------- nested _Item class -------------------------------
+  class _Item:
+    """Small key-value record; every comparison looks only at the key."""
+    __slots__ = ('_key', '_value')
+    def __init__(self, k, v):
+      self._key, self._value = k, v
+    def __eq__(self, other):
+      # two items are equal when their keys are equal; values are ignored
+      return self._key == other._key
+    def __ne__(self, other):
+      # defined as the negation of __eq__
+      return not (self == other)
+    def __lt__(self, other):
+      # items are ordered by key alone
+      return self._key < other._key
diff --git a/py/lib/red_black_tree.py b/py/lib/red_black_tree.py
new file mode 100644
index 0000000..c5905c7
--- /dev/null
+++ b/py/lib/red_black_tree.py
@@ -0,0 +1,112 @@
+# Copyright 2015, Chen Sun
+# Based on source code copyright by 2013, Michael H. Goldwasser
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+from lib.binary_search_tree import TreeMap
+class RedBlackTreeMap(TreeMap):
+  """Sorted map implementation using a red-black tree.
+  Subclasses TreeMap, adds a color flag to every node, and defines the
+  _rebalance_insert and _rebalance_delete methods that recolor/restructure.
+  """
+  #-------------------------- nested _Node class --------------------------
+  class _Node(TreeMap._Node):
+    """Node class for red-black tree maintains bit that denotes color."""
+    __slots__ = '_red'     # add additional data member to the Node class
+    def __init__(self, element, parent=None, left=None, right=None):
+      TreeMap._Node.__init__(self, element, parent, left, right)
+      self._red = True     # new node red by default
+  #------------------------- positional-based utility methods -------------------------
+  # we consider a nonexistent child to be trivially black
+  # Each helper takes a Position p and reads or writes the _red flag of p._node.
+  def _set_red(self, p): p._node._red = True
+  def _set_black(self, p): p._node._red = False
+  def _set_color(self, p, make_red): p._node._red = make_red
+  # _is_red tolerates p being None (reported as not red), unlike the setters above.
+  def _is_red(self, p): return p is not None and p._node._red
+  def _is_red_leaf(self, p): return self._is_red(p) and self.is_leaf(p)
+  def _get_red_child(self, p):
+    """Return a red child of p (or None if no such child)."""
+    # the left child is examined first, so it wins when both children are red
+    for child in (self.left(p), self.right(p)):
+      if self._is_red(child):
+        return child
+    return None
+  #------------------------- support for insertions -------------------------
+  def _rebalance_insert(self, p):
+    """Restore the coloring rules after the node at Position p was inserted."""
+    self._resolve_red(p)                         # new node is always red
+  def _resolve_red(self, p):
+    """Remedy a possible double-red violation at the red Position p.
+    A root is simply recolored black. Otherwise nothing happens unless p's
+    parent is red as well, in which case one of the two cases below applies.
+    """
+    if self.is_root(p):
+      self._set_black(p)                         # make root black
+    else:
+      parent = self.parent(p)
+      if self._is_red(parent):                   # double red problem
+        uncle = self.sibling(parent)
+        if not self._is_red(uncle):              # Case 1: misshapen 4-node
+          middle = self._restructure(p)          # do trinode restructuring
+          self._set_black(middle)                # and then fix colors
+          self._set_red(self.left(middle))
+          self._set_red(self.right(middle))
+        else:                                    # Case 2: overfull 5-node
+          grand = self.parent(parent)
+          self._set_red(grand)                   # grandparent becomes red
+          self._set_black(self.left(grand))      # its children become black
+          self._set_black(self.right(grand))
+          self._resolve_red(grand)               # recur at red grandparent
+  #------------------------- support for deletions -------------------------
+  def _rebalance_delete(self, p):
+    """Restore the coloring rules after a deletion.
+    Nothing is done when p is None, unless exactly one element remains.
+    """
+    # NOTE(review): presumably p is the parent of the node that was removed --
+    # confirm against the caller in TreeMap, which is not part of this file.
+    if len(self) == 1:
+      self._set_black(self.root())  # special case: ensure that root is black
+    elif p is not None:
+      n = self.num_children(p)
+      if n == 1:                    # deficit exists unless child is a red leaf
+        c = next(self.children(p))
+        if not self._is_red_leaf(c):
+          self._fix_deficit(p, c)
+      elif n == 2:                  # removed black node with red child
+        if self._is_red_leaf(self.left(p)):
+          self._set_black(self.left(p))
+        else:
+          self._set_black(self.right(p))
+  def _fix_deficit(self, z, y):
+    """Resolve black deficit at z, where y is the root of z's heavier subtree."""
+    if not self._is_red(y): # y is black; will apply Case 1 or 2
+      x = self._get_red_child(y)
+      if x is not None: # Case 1: y is black and has red child x; do "transfer"
+        old_color = self._is_red(z)          # remember z's color before restructuring
+        middle = self._restructure(x)
+        self._set_color(middle, old_color)   # middle gets old color of z
+        self._set_black(self.left(middle))   # children become black
+        self._set_black(self.right(middle))
+      else: # Case 2: y is black, but no red children; recolor as "fusion"
+        self._set_red(y)
+        if self._is_red(z):
+          self._set_black(z)                 # this resolves the problem
+        elif not self.is_root(z):
+          self._fix_deficit(self.parent(z), self.sibling(z)) # recur upward
+    else: # Case 3: y is red; rotate misaligned 3-node and repeat
+      self._rotate(y)
+      self._set_black(y)
+      self._set_red(z)
+      # after the rotation z is a child of y; continue with z's child on the far side
+      if z == self.right(y):
+        self._fix_deficit(z, self.left(z))
+      else:
+        self._fix_deficit(z, self.right(z))
diff --git a/py/lib/tree.py b/py/lib/tree.py
new file mode 100644
index 0000000..921ba67
--- /dev/null
+++ b/py/lib/tree.py
@@ -0,0 +1,151 @@
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+from lib.linked_queue import LinkedQueue #LinkedQueue is only used for bfs
+import collections
+class Tree:
+  """Abstract base class representing a tree structure.
+  Subclasses supply root, parent, num_children, children and __len__; the
+  remaining methods here are built on top of those five.
+  """
+  #------------------------------- nested Position class -------------------------------
+  class Position:
+    """An abstraction representing the location of a single element within a tree.
+    Note that two position instances may represent the same inherent location in a tree.
+    Therefore, users should always rely on syntax 'p == q' rather than 'p is q' when testing
+    equivalence of positions.
+    We define a tree ADT using the concept of a position as an abstraction for a node of a tree.
+    """
+    def element(self):
+      """Return the element stored at this Position."""
+      raise NotImplementedError('must be implemented by subclass')
+    def __eq__(self, other):
+      """Return True if other Position represents the same location."""
+      raise NotImplementedError('must be implemented by subclass')
+    def __ne__(self, other):
+      """Return True if other does not represent the same location."""
+      return not (self == other)            # opposite of __eq__
+  # ---------- abstract methods that concrete subclass must support ----------
+  def root(self):
+    """Return Position representing the tree's root (or None if empty)."""
+    raise NotImplementedError('must be implemented by subclass')
+  def parent(self, p):
+    """Return Position representing p's parent (or None if p is root)."""
+    raise NotImplementedError('must be implemented by subclass')
+  def num_children(self, p):
+    """Return the number of children that Position p has."""
+    raise NotImplementedError('must be implemented by subclass')
+  def children(self, p):
+    """Generate an iteration of Positions representing p's children."""
+    raise NotImplementedError('must be implemented by subclass')
+  def __len__(self):
+    """Return the total number of elements in the tree."""
+    raise NotImplementedError('must be implemented by subclass')
+  # ---------- concrete methods implemented in this class ----------
+  def is_root(self, p):
+    """Return True if Position p represents the root of the tree."""
+    return self.root() == p
+  def is_leaf(self, p):
+    """Return True if Position p does not have any children."""
+    return self.num_children(p) == 0
+  def is_empty(self):
+    """Return True if the tree is empty."""
+    return len(self) == 0
+  def depth(self, p):
+    """Return the number of levels separating Position p from the root."""
+    # recursive: one level per step from p up to the root
+    if self.is_root(p):
+      return 0
+    else:
+      return 1 + self.depth(self.parent(p))
+  def _height1(self):                 # works, but O(n^2) worst-case time
+    """Return the height of the tree."""
+    # max() over the depth of every leaf; with no positions at all max() raises ValueError
+    return max(self.depth(p) for p in self.positions() if self.is_leaf(p))
+  def _height2(self, p):                  # time is linear in size of subtree
+    """Return the height of the subtree rooted at Position p."""
+    if self.is_leaf(p):
+      return 0
+    else:
+      return 1 + max(self._height2(c) for c in self.children(p))
+  def height(self, p=None):
+    """Return the height of the subtree rooted at Position p.
+    If p is None, return the height of the entire tree.
+    """
+    if p is None:
+      p = self.root()
+    return self._height2(p)        # start _height2 recursion
+  def __iter__(self):
+    """Generate an iteration of the tree's elements."""
+    for p in self.positions():                        # use same order as positions()
+      yield p.element()                               # but yield each element
+  def positions(self):
+    """Generate an iteration of the tree's positions."""
+    return self.preorder()                            # return entire preorder iteration
+  def preorder(self):
+    """Generate a preorder iteration of positions in the tree."""
+    if not self.is_empty():
+      for p in self._subtree_preorder(self.root()):  # start recursion
+        yield p
+  def _subtree_preorder(self, p):
+    """Generate a preorder iteration of positions in subtree rooted at p."""
+    yield p                                           # visit p before its subtrees
+    for c in self.children(p):                        # for each child c
+      for other in self._subtree_preorder(c):         # do preorder of c's subtree
+        yield other                                   # yielding each to our caller
+  def postorder(self):
+    """Generate a postorder iteration of positions in the tree."""
+    if not self.is_empty():
+      for p in self._subtree_postorder(self.root()):  # start recursion
+        yield p
+  def _subtree_postorder(self, p):
+    """Generate a postorder iteration of positions in subtree rooted at p."""
+    for c in self.children(p):                        # for each child c
+      for other in self._subtree_postorder(c):        # do postorder of c's subtree
+        yield other                                   # yielding each to our caller
+    yield p                                           # visit p after its subtrees
+  def breadthfirst(self):
+    """Generate a breadth-first iteration of the positions of the tree."""
+    if not self.is_empty():
+      fringe = LinkedQueue()             # known positions not yet yielded
+      fringe.enqueue(self.root())        # starting with the root
+      while not fringe.is_empty():
+        p = fringe.dequeue()             # remove from front of the queue
+        yield p                          # report this position
+        for c in self.children(p):
+          fringe.enqueue(c)              # add children to back of queue
diff --git a/py/vcfcompare.py b/py/vcfcompare.py
new file mode 100755
index 0000000..7d23f22
--- /dev/null
+++ b/py/vcfcompare.py
@@ -0,0 +1,1098 @@
+#!/usr/bin/env python
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+# Author: Chen Sun (chensun at cse.psu.edu)
+import sys
+versionError = "You are using an old version of python, please upgrade to python 2.7+\n"
+# Refuse to run on interpreters older than 2.7. The message goes to stderr and
+# the process ends with a non-zero status so calling scripts can detect the
+# failure; sys.exit is used because the bare exit() helper is only installed
+# by the site module.
+if sys.hexversion < 0x02070000:
+    sys.stderr.write(versionError)
+    sys.exit(1)
+#elif sys.hexversion > 0x03000000:
+#    print ("python 3")
+import subprocess
+import argparse
+import os
+import copy
+from lib.red_black_tree import RedBlackTreeMap
+import numpy
+import scipy.cluster.hierarchy as hcluster
+import itertools
+# for profile memory usage
+#from memory_profiler import profile
+# Command-line interface. The citation text is shown as the epilog of --help.
+citation = 'About algorithm used in VCF-Compare, please refer to "Method for Cross-Validating Variant Call Set" Section in our paper.'+'\n Please cite our paper.'
+parser = argparse.ArgumentParser(epilog = citation)
+# required inputs: the two VCF files to compare and the reference genome (fasta)
+parser.add_argument('-r', '--reference', required=True, help = 'reference vcf file path, usually larger than query vcf file')
+parser.add_argument('-q', '--query', required=True, help = 'query vcf file path')
+parser.add_argument('-g', '--genome', required=True, help= 'reference genome file path, fasta file format')
+# optional output locations, each with a default file name in the working directory
+parser.add_argument('-p', '--false_positive', help='false positive, i.e. mismatch vcf entries in query vcf file, default=false_positive.vcf', default='false_positive.vcf')
+parser.add_argument('-n', '--false_negative', help='false negative, i.e. mismatch vcf entries in reference vcf file, default=false_negative.vcf', default='false_negative.vcf')
+#parser.add_argument('-t', '--true_positive', help='true positive bed file position', default='true_positive.bed')
+parser.add_argument('-o', '--output', help='output matched variants in stage 2 and 3, default=multi_match.out', default='multi_match.out')
+parser.add_argument('-d', '--direct_search', help='if activate, only perform stage 1, default=not activate', action = 'store_true')
+parser.add_argument('-c', '--chr', help='chromosome name or id, used for parallel multi genome analysis', default='.')
+parser.add_argument('-s', '--stat', help='append statistics result into a file, useful for parallel multi genome analysis', default='stat.txt')
+# Parsed at import time; the search functions below read args.output directly.
+args = parser.parse_args()
+#match_set = []
+#matched_quality_set = []
+#refPos_quality = {}
+######################### for debug ###########################
+# module-level dictionaries, created empty here
+refPos_vcfEntry = {}
+quePos_vcfEntry = {}
+#ref_match_total = set()
+#que_match_total = set()
+def direct_search(refPos_snp, quePos_snp):
+    """Remove entries that are stored identically in both mappings.
+    A key of quePos_snp is a direct match when refPos_snp holds the same key
+    and the two stored values compare equal. Every match is written to
+    'direct_search.txt' as 'key,ref_value<TAB>key,query_value' and is then
+    popped from both mappings in place. Returns None.
+    """
+    # Collect the matches first so that neither mapping is modified while
+    # quePos_snp is being iterated.
+    matched = [key for key in quePos_snp
+               if key in refPos_snp and refPos_snp[key] == quePos_snp[key]]
+    # 'with' closes the report file even when a write raises.
+    with open('direct_search.txt', 'w') as match_file:
+        for key in matched:
+            match_string = str(key) + ',' + str(refPos_snp[key]) + '\t' + str(key) + ',' + str(quePos_snp[key]) + '\n'
+            match_file.write(match_string)
+            refPos_snp.pop(key, None) # delete value with key
+            quePos_snp.pop(key, None)
+def modify_sequence(sequence, pos, snpSet):
+    """Return a copy of sequence with one variant applied at index pos.
+    snpSet[1] is used as the ref string and snpSet[2] as the alt string
+    (snpSet[0] is not read here). The len(ref) characters of sequence starting
+    at pos are replaced by alt. A snpSet whose length is not 3 only triggers a
+    printed error message; processing continues.
+    """
+    if len(snpSet) != 3:
+        print ("Error: snp set size not right.")
+    ref = snpSet[1]
+    alt = snpSet[2]
+    # The former comparison of sequence[pos:pos+len(ref)] against ref had an
+    # empty body (pass) and therefore no effect; it has been removed. The
+    # substitution is applied whether or not the sequence agrees with ref.
+    return sequence[:pos] + alt + sequence[pos+len(ref):]
+def near_search(refPos_snp, quePos_snp, genome, blockSize):
+    """Match a query variant against the nearest reference variant.
+    For each key of quePos_snp the entry returned by
+    refPos_snp.find_nearest(key) is taken as the candidate. The pair is skipped
+    when the two positions are more than blockSize apart or when element 0 of
+    the two variants differs. Otherwise both variants are applied to the same
+    genome window (100 characters of padding on each side, clamped to the
+    genome) and the results are compared ignoring case. On equality a line
+    '.<TAB>refPos,ref,alt<TAB>quePos,ref,alt<TAB>.' is appended to
+    args.output, the reference entry is popped and the query key is removed
+    once the scan is over. Returns None; both mappings are modified in place.
+    """
+    queRemoveList = [] # record quePos that should be deleted
+    genomeLen = len(genome) # record genome length
+    # 'with' closes the output file even if a lookup or write raises.
+    with open(args.output, 'a') as output:
+        if refPos_snp is None:
+            print ("Error: refPos_snp is None")
+        if quePos_snp is None:
+            print ("Error: quePos_snp is None")
+        for quePos in quePos_snp:
+            ref_element = refPos_snp.find_nearest(quePos) # return a position
+            ref_snp = ref_element.value()
+            que_snp = quePos_snp[quePos]
+            refPos = ref_element.key()
+            if abs(refPos-quePos) > blockSize:
+                continue
+            if ref_snp[0] != que_snp[0]:
+                continue
+            # genome window around both variants, clamped to valid indices
+            seqStart = max(min(quePos, refPos)-100, 0)
+            seqEnd = min(max(quePos, refPos)+100, genomeLen-1)
+            subSequence = genome[seqStart:seqEnd+1]
+            # apply each variant to its own copy of the window and compare
+            refSequence = modify_sequence(subSequence, refPos-seqStart, ref_snp)
+            queSequence = modify_sequence(subSequence, quePos-seqStart, que_snp)
+            if refSequence.upper() == queSequence.upper():
+                queRemoveList.append(quePos)
+                ref_variants = '{},{},{}'.format(refPos, ref_snp[1], ref_snp[2])
+                query_variants = '{},{},{}'.format(quePos, que_snp[1], que_snp[2])
+                output.write('.\t{}\t{}\t.\n'.format(ref_variants, query_variants))
+                refPos_snp.pop(refPos, None)
+                # NOTE(review): this break ends the scan over quePos_snp after
+                # the first match (kept as in the original) -- confirm intent.
+                break
+    # The module-level match_set list no longer exists (its definition is
+    # commented out), so matched positions are only removed from quePos_snp.
+    for pos in queRemoveList:
+        quePos_snp.pop(pos, None)
+def powerful_near_search(refPos_snp, quePos_snp, genome, blockSize):
+    """Match each query variant against reference variants within blockSize.
+    For each key of quePos_snp, the (position, variant) pairs produced by
+    refPos_snp.find_range(key-blockSize, key+blockSize) (bounds clamped to the
+    genome) are tried in order. Candidates whose element 0 differs from the
+    query's are skipped. Both variants are applied to the same genome window
+    (100 characters of padding, clamped) and compared ignoring case; the first
+    equal candidate is popped from refPos_snp and the query key is removed
+    after the scan. Nothing is written to args.output, but the file is still
+    opened in append mode, so it exists afterwards. Returns None.
+    """
+    queRemoveList = [] # record quePos that should be deleted
+    genomeLen = len(genome) # record genome length
+    # Kept for parity with the earlier behavior: opening in append mode
+    # creates args.output when it is missing. No line is written here.
+    open(args.output, 'a').close()
+    if refPos_snp is None:
+        print ("Error: refPos_snp is None")
+    if quePos_snp is None:
+        print ("Error: quePos_snp is None")
+    for quePos in quePos_snp:
+        que_snp = quePos_snp[quePos]
+        minPos = max(quePos-blockSize, 0)
+        maxPos = min(quePos+blockSize, genomeLen-1)
+        for (refPos, ref_snp) in refPos_snp.find_range(minPos, maxPos):
+            if ref_snp[0] != que_snp[0]:
+                continue
+            # genome window around both variants, clamped to valid indices
+            seqStart = max(min(quePos, refPos)-100, 0)
+            seqEnd = min(max(quePos, refPos)+100, genomeLen-1)
+            subSequence = genome[seqStart:seqEnd+1]
+            # apply each variant to its own copy of the window and compare
+            refSequence = modify_sequence(subSequence, refPos-seqStart, ref_snp)
+            queSequence = modify_sequence(subSequence, quePos-seqStart, que_snp)
+            if refSequence.upper() == queSequence.upper():
+                queRemoveList.append(quePos)
+                refPos_snp.pop(refPos, None)
+                # stop iterating find_range right after mutating refPos_snp
+                break
+    # The module-level match_set list no longer exists (its definition is
+    # commented out), so matched positions are only removed from quePos_snp.
+    for pos in queRemoveList:
+        quePos_snp.pop(pos, None)
+def modify_by_list(pos_snp, posList, sequence, bound):
+    """Return sequence with every variant listed in posList applied.
+    pos_snp maps a position to a 3-element variant whose element 1 is the ref
+    string and element 2 the alt string. bound is the position that index 0 of
+    sequence corresponds to, so a variant at pos is applied at index pos-bound.
+    posList itself is left unmodified. A variant whose length is not 3 only
+    triggers a printed error message.
+    """
+    # Apply from the highest position down so that a length-changing edit
+    # never shifts the index of a variant that is still to be applied.
+    for pos in sorted(posList, reverse=True):
+        snp = pos_snp[pos]
+        if len(snp) != 3:
+            print ("Error: snp set size not right.")
+        index = pos-bound
+        ref = snp[1]
+        alt = snp[2]
+        # The former comparison of the sequence slice against ref had an empty
+        # body (pass) and no effect; it has been removed.
+        sequence = sequence[:index] + alt + sequence[index+len(ref):]
+    return sequence
+def complex_search(refPos_snp, quePos_snp, genome, rev):
+    """Match one query variant against the group of reference variants near it.
+    For every (key, value) produced by quePos_snp.find_range(None, None),
+    reference entries are collected with
+    refPos_snp.linear_range_search(start_position, minPos, maxPos), where
+    minPos is the query key and maxPos is derived from the length of the
+    query's element 1. The list is then extended backwards with
+    refPos_snp.before(...) while the earlier entry's key plus the length of
+    its element 1, minus 1, is still >= minPos. All collected reference
+    variants and the single query variant are applied to the same genome
+    window (100 characters of padding, clamped to the genome) and compared
+    ignoring case. On equality the reference entries are popped from
+    refPos_snp and the query key is removed from quePos_snp after the loop.
+    rev only selects which of the two label strings (ref_variants or
+    query_variants) the popped entries are formatted into; the write of
+    match_string is commented out, so nothing is written to args.output even
+    though the file is opened. Returns None.
+    """
+    # NOTE(review): the exact bound semantics of linear_range_search and the
+    # role of its first argument are defined in RedBlackTreeMap/TreeMap, which
+    # are not visible here -- confirm before changing minPos/maxPos.
+    #global ref_match_total
+    #global que_match_total
+    queRemoveList = [] # record quePos that should be deleted
+    genomeLen = len(genome) # record genome length
+    output = open(args.output, 'a+') #open output file for
+    if refPos_snp is None:
+        print ("Error: refPos_snp is None")
+    if quePos_snp is None:
+        print ("Error: quePos_snp is None")
+    num = 0
+    # position handed to linear_range_search as its starting point; None at first
+    start_position = None
+    for (key, value) in quePos_snp.find_range(None, None):
+        num += 1
+        que_snp = value
+        quePos = key
+        minPos = key
+        maxPos = min(key+len(que_snp[1])-1, genomeLen-1) + 1
+        candidateRefPos = []
+        candidateRefNode = []
+        temp_refPos_snp = {}
+        # sentinels for tracking the smallest / largest candidate key
+        min_refPos = 3000000000
+        max_refPos = 0
+        for p in refPos_snp.linear_range_search(start_position, minPos, maxPos):
+            k = p.key()
+            v = p.value()
+            if min_refPos > k:
+                min_refPos = k
+            if max_refPos < k:
+                max_refPos = k
+            candidateRefNode.append(p)
+            candidateRefPos.append(k)
+            temp_refPos_snp[k] = v
+        #get the substring
+        if len(candidateRefPos) == 0:
+            continue
+        # pull in earlier reference entries whose span still reaches minPos
+        before = refPos_snp.before(candidateRefNode[0])
+        while before is not None and before.key() + len(before.value()[1]) - 1 >= minPos:
+            #print ('find before boundary in stage 2')
+            candidateRefNode.insert(0, before)
+            min_refPos = before.key()
+            candidateRefPos.append(before.key())
+            temp_refPos_snp[before.key()] = before.value()
+            before = refPos_snp.before(candidateRefNode[0])
+        # keys added by the loop above were appended at the end, so re-sort
+        candidateRefPos.sort()
+        seqStart = min(key, min_refPos)-100
+        if seqStart < 0:
+            seqStart = 0
+        seqEnd = max(key, max_refPos) + 100
+        if seqEnd > genomeLen-1:
+            seqEnd = genomeLen-1
+        subSequence = genome[seqStart:seqEnd+1]
+        queIndex = quePos-seqStart
+        #modify string and then compare
+        refSequence = modify_by_list(temp_refPos_snp, candidateRefPos, subSequence, seqStart)
+        queSequence = modify_sequence(subSequence, queIndex, que_snp)
+        if refSequence.upper() == queSequence.upper():
+            #matched
+            # next search starts after the last matched reference node
+            start_position = refPos_snp.after(candidateRefNode[-1])
+            queRemoveList.append(quePos)
+            ref_variants = ''
+            query_variants = ''
+            if not rev:
+                # all but the last candidate get a trailing ';' separator
+                for index in range(len(candidateRefPos)-1):
+                    pos = candidateRefPos[index]
+                    ref_variants += '{},{},{};'.format(pos, temp_refPos_snp[pos][1], temp_refPos_snp[pos][2])
+                    #ref_match_total.add(pos)
+                    #be sure to recover
+                    refPos_snp.pop(pos)
+                ref_pos = candidateRefPos[-1]
+                ref_variants += '{},{},{}'.format(ref_pos, temp_refPos_snp[ref_pos][1], temp_refPos_snp[ref_pos][2])
+                #ref_match_total.add(ref_pos)
+                # be sure to recover
+                refPos_snp.pop(ref_pos)
+                #multi_match_ref += 1
+                query_variants = '{},{},{}'.format(quePos, que_snp[1], que_snp[2])
+            else:
+                # same removals as above; only the two label strings are swapped
+                for index in range(len(candidateRefPos)-1):
+                    pos = candidateRefPos[index]
+                    query_variants += '{},{},{};'.format(pos, temp_refPos_snp[pos][1], temp_refPos_snp[pos][2])
+                    #que_match_total.add(pos)
+                    refPos_snp.pop(pos)
+                ref_pos = candidateRefPos[-1]
+                query_variants += '{},{},{}'.format(ref_pos, temp_refPos_snp[ref_pos][1], temp_refPos_snp[ref_pos][2])
+                #que_match_total.add(ref_pos)
+                refPos_snp.pop(ref_pos)
+                #multi_match_ref += 1
+                ref_variants = '{},{},{}'.format(quePos, que_snp[1], que_snp[2])
+            # output_info and match_string are built but currently unused
+            output_info = '{},{}'.format(subSequence, refSequence.upper())
+            match_string = '.\t{}\t{}\t.\n'.format(ref_variants, query_variants)
+            #output.write(match_string)
+        else:
+            # no match: restart the next search from the first candidate node
+            start_position = candidateRefNode[0]
+    output.close()
+    # query entries are removed only after the iteration over quePos_snp is done
+    for pos in queRemoveList:
+        #match_set.append(pos)
+        #que_match_total.add(pos)
+        quePos_snp.pop(pos, None)
+def convert_substitution(pos_list, pos_snp, subsequence, low_bound):
+    """Unfinished stub: print a notice and return None.
+    Everything after the first return statement is unreachable.
+    """
+    print ('convert_substitution: unfinished function')
+    return
+    # ---- unreachable draft below ----
+    # NOTE(review): the draft reads ref/alt from elements 0 and 1 of a variant,
+    # whereas modify_sequence and modify_by_list read elements 1 and 2.
+    # NOTE(review): list.insert needs an index and an object; the one-argument
+    # indel_list.insert(...) calls below would raise TypeError if ever reached.
+    # NOTE(review): indel_list is built but never returned.
+    indel_list = []
+    pos_list.sort()
+    for pos in pos_list:
+        variant = pos_snp[pos]
+        relative_pos = pos - low_bound
+        ref = variant[0]
+        alt = variant[1]
+        if len(ref) == len(alt):
+            assert len(ref) == 1, 'snp should be normalized and decomposed.'
+            temp_del = [relative_pos, -1, ref]
+            temp_ins = [relative_pos, 1, ref]
+            indel_list.append(temp_del)
+            indel_list.append(temp_ins)
+        elif len(ref) > len(alt): # deletion
+            del_position = relative_pos + len(ref) - 1
+            for i in range(len(ref)-1):
+                del_n = ref[i+1]
+                temp_del = [del_position, -1, del_n]
+                indel_list.insert(temp_del)
+        elif len(ref) < len(alt): # insertion
+            ins_position = relative_pos + 1
+            for i in range(len(alt)-1):
+                ins_n = alt[i+1]
+                temp_ins = [ins_position, 1, ins_n]
+                indel_list.insert(temp_ins)
+def check_transition_theory(candidateRefPos, candidateQuePos, temp_refPos_snp, temp_quePos_snp, subSequence, lowBound):
+    """Forward the reference-side arguments to convert_substitution; returns None.
+    candidateQuePos and temp_quePos_snp are accepted but not used.
+    """
+    convert_substitution(candidateRefPos, temp_refPos_snp, subSequence, lowBound)
+def multi_search(refPos_snp, quePos_snp, genome, blockSize):
+    #global ref_match_total
+    #global que_match_total
+    multi_match = 0
+    multi_match_ref = 0
+    multi_match_que = 0
+    one2multi = 0
+    multi2multi = 0
+    genomeLen = len(genome)
+    output = open(args.output, 'a+') #open output file for
+    refPosDelSet = set()
+    quePosDelSet = set()
+    #debug = False
+    ref_start_position = None
+    que_start_position = None
+    for key in quePos_snp.keys()[:]:
+        if not key in quePos_snp:  # logN operation
+            continue
+        candidateRefPos = []
+        candidateQuePos = []
+        candidateRefNode = []
+        candidateQueNode = []
+        minPos = max(key-blockSize, 0)
+        maxPos = min(key+blockSize, genomeLen-1) + 1
+        temp_refPos_snp = {}
+        for p in refPos_snp.linear_range_search(ref_start_position, minPos, maxPos):
+            k = p.key()
+            v = p.value()
+            candidateRefNode.append(p)
+            candidateRefPos.append(k)
+            temp_refPos_snp[k] = v
+        temp_quePos_snp = {}
+        for p in quePos_snp.linear_range_search(que_start_position, minPos, maxPos):
+            k = p.key()
+            v = p.value()
+            candidateQueNode.append(p)
+            candidateQuePos.append(k)
+            temp_quePos_snp[k] = v
+        if len(candidateQuePos) == 0:
+            print ("Error: query empty")
+            continue
+        if len(candidateRefPos) == 0:
+            continue
+        min_ref_pos = candidateRefPos[0]
+        max_ref_pos = candidateRefPos[-1] + len(temp_refPos_snp[candidateRefPos[-1]][1]) - 1
+        min_que_pos = candidateQuePos[0]
+        max_que_pos = candidateQuePos[-1] + len(temp_quePos_snp[candidateQuePos[-1]][1]) - 1
+        """
+        ref_before = refPos_snp.before(candidateRefNode[0])
+        que_before = quePos_snp.before(candidateQueNode[0])
+        while (ref_before is not None and ref_before.key() + len(ref_before.value()[0]) - 1 >= min_que_pos) or (que_before is not None and que_before.key() + len(que_before.value()[0])-1 > min_ref_pos):
+            #print ('find before boundary in stage 3')
+            if ref_before is not None and ref_before.key() + len(ref_before.value()[0]) - 1 >= min_que_pos :
+                candidateRefNode.insert(0, ref_before)
+                min_ref_pos = ref_before.key()
+                candidateRefPos.insert(0, ref_before.key())
+                temp_refPos_snp[ref_before.key()] = ref_before.value()
+                ref_before = refPos_snp.before(candidateRefNode[0])
+            if que_before is not None and que_before.key() + len(que_before.value()[0]) - 1 >= min_ref_pos :
+                candidateQueNode.insert(0, que_before)
+                min_que_pos = que_before.key()
+                candidateQuePos.insert(0, que_before.key())
+                temp_quePos_snp[que_before.key()] = que_before.value()
+                que_before = quePos_snp.before(candidateQueNode[0])
+        ref_after = refPos_snp.after(candidateRefNode[-1])
+        que_after = quePos_snp.after(candidateQueNode[-1])
+        while (ref_after is not None and ref_after.key() <= max_que_pos) or (que_after is not None and que_after.key() <= max_ref_pos):
+            #print ('find after boundary in stage 3')
+            if ref_after is not None and ref_after.key() <= max_que_pos :
+                candidateRefNode.append(ref_after)
+                max_ref_pos = ref_after.key() + len(ref_after.value()[1]) - 1
+                candidateRefPos.append(ref_after.key())
+                temp_refPos_snp[ref_after.key()] = ref_after.value()
+                ref_after = refPos_snp.after(candidateRefNode[-1])
+            if que_after is not None and que_after.key() <= max_ref_pos :
+                candidateQueNode.append(que_after)
+                max_que_pos = que_after.key() + len(que_after.value()[1]) - 1
+                candidateQuePos.append(que_after.key())
+                temp_quePos_snp[que_after.key()] = que_after.value()
+                que_after = quePos_snp.after(candidateQueNode[-1])
+        """
+        lowBound = candidateRefPos[0]
+        upperBound = candidateRefPos[-1]
+        if lowBound > candidateQuePos[0]:
+            lowBound = candidateQuePos[0]
+        if upperBound < candidateQuePos[-1]:
+            upperBound = candidateQuePos[-1]
+        lowBound = max(0, lowBound-100)
+        upperBound = min(upperBound+100, genomeLen-1)
+        subSequence = genome[lowBound: upperBound+1]
+        refSequence = modify_by_list(temp_refPos_snp, candidateRefPos, subSequence, lowBound)
+        queSequence = modify_by_list(temp_quePos_snp, candidateQuePos, subSequence, lowBound)
+        if refSequence.upper() == queSequence.upper():
+            #print ("multi_search works")
+            ref_start_position = refPos_snp.after(candidateRefNode[-1])
+            que_start_position = quePos_snp.after(candidateQueNode[-1])
+            ref_variants = ''
+            query_variants = ''
+            #check_transition_theory(candidateRefPos, candidateQuePos, temp_refPos_snp, temp_quePos_snp, subSequence, lowBound)
+            for index in range(len(candidateRefPos)-1):
+                pos = candidateRefPos[index]
+                ref_variants += '{},{},{};'.format(pos, temp_refPos_snp[pos][1], temp_refPos_snp[pos][2])
+                #ref_match_total.add(pos)
+                refPos_snp.pop(pos)
+                multi_match_ref += 1
+            ref_pos = candidateRefPos[-1]
+            ref_variants += '{},{},{}'.format(ref_pos, temp_refPos_snp[ref_pos][1], temp_refPos_snp[ref_pos][2])
+            #ref_match_total.add(ref_pos)
+            refPos_snp.pop(ref_pos)
+            multi_match_ref += 1
+            for index in range(len(candidateQuePos)-1):
+                pos = candidateQuePos[index]
+                query_variants += '{},{},{};'.format(pos, temp_quePos_snp[pos][1], temp_quePos_snp[pos][2])
+                #que_match_total.add(pos)
+                quePos_snp.pop(pos)
+                #quePosList.remove(pos)
+                #match_set.append(pos)
+                multi_match_que += 1
+                #quePosDelSet.add(pos)
+            que_pos = candidateQuePos[-1]
+            query_variants += '{},{},{}'.format(que_pos, temp_quePos_snp[que_pos][1], temp_quePos_snp[que_pos][2])
+            #que_match_total.add(que_pos)
+            quePos_snp.pop(que_pos)
+            #quePosList.remove(que_pos)
+            #match_set.append(que_pos)
+            multi_match_que += 1
+            #quePosDelSet.add(que_pos)
+            output_info = '{},{},{},{},{}'.format(blockSize, lowBound, upperBound+1, subSequence, refSequence.upper())
+            match_string = '.\t{}\t{}\t{}\n'.format(ref_variants, query_variants, output_info)
+            #output.write(match_string)
+            multi_match += 1
+            if len(candidateRefPos) == 1 or len(candidateQuePos) == 1:
+                one2multi += 1
+            else:
+                multi2multi += 1
+        else:
+            ref_start_position = candidateRefNode[0]
+            que_start_position = candidateQueNode[0]
+    output.close()
+    #print (multi_match, multi_match_ref, multi_match_que, one2multi, multi2multi)
+def match_by_tuple(ref_choice, que_choice, temp_refPos_snp, temp_quePos_snp, sequence):
+    ref_choice_list = list(ref_choice)
+    que_choice_list = list(que_choice)
+    ref_choice_list.sort()
+    que_choice_list.sort()
+    min_pos = min(ref_choice_list[0], que_choice_list[0])
+    max_pos = max(ref_choice_list[-1], que_choice_list[-1])
+    min_pos -= 100
+    max_pos += 100
+    min_pos = max(0, min_pos)
+    max_pos = min(len(sequence)-1, max_pos)
+    sub_sequence = sequence[min_pos: max_pos+1]
+    ref_sequence = modify_by_list(temp_refPos_snp, ref_choice_list, sub_sequence, min_pos)
+    que_sequence = modify_by_list(temp_quePos_snp, que_choice_list, sub_sequence, min_pos)
+    return ref_sequence.upper() == que_sequence.upper()
+def cluster_search_old(refPos_snp, quePos_snp, data_list, cluster_list, data_list_ref_que_dict, sequence):
+    # refPos_snp and quePos_snp are red_black_tree_map which operates like dictionary expect keys are sorted.
+    # all index less than data_list_index_threshold is in refPos_snp
+    # otherwise, it is in quePos_snp
+    print 'cluster search'
+    cluster_pos = {}
+    for index in range(len(cluster_list)):
+        cluster_id = cluster_list[index]
+        pos = data_list[index]
+        if cluster_id in cluster_pos:
+            cluster_pos[cluster_id].append(pos)
+        else:
+            cluster_pos[cluster_id] = [pos]
+    print 'iterate clusters'
+    for cluster_id in cluster_pos:
+        pos_list = cluster_pos[cluster_id]
+        if len(pos_list) <= 2:
+            continue
+        candidateRefPos = []
+        candidateQuePos = []
+        temp_refPos_snp = {}
+        temp_quePos_snp = {}
+        min_pos = len(sequence) - 1
+        max_pos = 0
+        for temp_pos in pos_list:
+            if data_list_ref_que_dict[temp_pos] > 0:
+                candidateRefPos.append(temp_pos)
+                temp_refPos_snp[temp_pos] = refPos_snp[temp_pos]
+            else:
+                candidateQuePos.append(temp_pos)
+                temp_quePos_snp[temp_pos] = quePos_snp[temp_pos]
+        if len(candidateRefPos) <= 1 or len(candidateQuePos) <= 1:
+            continue
+        # now we have the candidateRefPos and candidateQuePos
+        # next step is to permutate all combinations
+        # rule is that at least one should from each list
+        is_matched = False
+        for i in range(1, len(candidateRefPos), 1):
+            if is_matched:
+                break
+            ref_combination_list = list(itertools.combinations(candidateRefPos, i))
+            for j in range(1, len(candidateQuePos), 1):
+                if is_matched:
+                    break
+                que_combination_list = list(itertools.combinations(candidateQuePos, j))
+                #print ref_combination_list
+                #print que_combination_list
+                for ref_choice in ref_combination_list:
+                    if is_matched:
+                        break
+                    for que_choice in que_combination_list:
+                        is_matched = match_by_tuple(ref_choice, que_choice, temp_refPos_snp, temp_quePos_snp, sequence)
+                        if is_matched:
+                            for pos in ref_choice:
+                                refPos_snp.pop(pos)
+                            for pos in que_choice:
+                                quePos_snp.pop(pos)
+                            break
+# ----------------------------end of cluster_search_old------------------------------------------------
+def cluster_search(refPos_snp, quePos_snp, data_list, cluster_list, sequence):
+    """
+    cluster_search use hash table to reduce running time from 2^(mn) to 2^m + 2^n
+    """
+    # refPos_snp and quePos_snp are red_black_tree_map which operates like dictionary expect keys are sorted.
+    # all index less than data_list_index_threshold is in refPos_snp
+    # otherwise, it is in quePos_snp
+    print 'cluster search'
+    output = open(args.output, 'a') #open output file for
+    cluster_data = {}
+    for index in range(len(cluster_list)):
+        cluster_id = cluster_list[index]
+        data = data_list[index]
+        #print data, cluster_id
+        if cluster_id in cluster_data:
+            cluster_data[cluster_id].append(data)
+        else:
+            cluster_data[cluster_id] = [data]
+    print 'iterate clusters'
+    #print cluster_data[1222]
+    cluster_num = 0
+    for cluster_id in sorted(cluster_data):
+        cluster_num += 1
+        #print cluster_id
+        data_list = cluster_data[cluster_id]
+        if len(data_list) <= 2:
+            continue
+        candidateRefPos = []
+        candidateQuePos = []
+        temp_refPos_snp = {}
+        temp_quePos_snp = {}
+        min_pos = data_list[0][0]
+        max_pos = data_list[-1][0]
+        min_pos -= 100
+        max_pos += 100
+        min_pos = max(0, min_pos)
+        max_pos = min(max_pos, len(sequence)-1)
+        sub_sequence = sequence[min_pos: max_pos+1]
+        for temp_data in data_list:
+            temp_pos = temp_data[0]
+            if temp_data[1] > 0:
+                candidateRefPos.append(temp_pos)
+                temp_refPos_snp[temp_pos] = refPos_snp[temp_pos]
+            else:
+                candidateQuePos.append(temp_pos)
+                temp_quePos_snp[temp_pos] = quePos_snp[temp_pos]
+        if len(candidateRefPos) <= 1 and len(candidateQuePos) <= 1:
+            continue
+        candidateRefPos.sort()
+        candidateQuePos.sort()
+        # now we have the candidateRefPos and candidateQuePos
+        # next step is to permutate all combinations
+        # rule is that should pick at least one from each list
+        #if cluster_id == 1222:
+        #    print candidateRefPos, candidateQuePos
+        ref_sequence_choice = {}
+        ref_pos_del_set = set()
+        que_pos_del_set = set()
+        for i in range(1, len(candidateRefPos)+1, 1):
+            ref_combination_list = list(itertools.combinations(candidateRefPos, i))
+            for ref_combination in ref_combination_list:
+                ref_choice_list = list(ref_combination)
+                ref_sequence = modify_by_list(temp_refPos_snp, ref_choice_list, sub_sequence, min_pos)
+                ref_sequence = ref_sequence.upper()
+                ref_sequence_choice[ref_sequence] = ref_choice_list
+        for j in range(1, len(candidateQuePos)+1, 1):
+            que_combination_list = list(itertools.combinations(candidateQuePos, j))
+            for que_combination in que_combination_list:
+                que_choice_list = list(que_combination)
+                que_sequence = modify_by_list(temp_quePos_snp, que_choice_list, sub_sequence, min_pos)
+                que_sequence = que_sequence.upper()
+                if que_sequence in ref_sequence_choice:
+                    ref_choice_list = ref_sequence_choice[que_sequence]
+                    #print 'matched', ref_choice_list, que_choice_list
+                    for pos in ref_choice_list:
+                        ref_pos_del_set.add(pos)
+                    for pos in que_choice_list:
+                        que_pos_del_set.add(pos)
+        #===============================output matching results========================================
+        if len(ref_pos_del_set) == 0 or len(que_pos_del_set) == 0:
+            continue
+        ref_pos_del_list = list(ref_pos_del_set)
+        que_pos_del_list = list(que_pos_del_set)
+        ref_variants = ""
+        for index in range(len(ref_pos_del_list)-1):
+            pos = ref_pos_del_list[index]
+            ref_variants += '{},{},{};'.format(pos, refPos_snp[pos][1], refPos_snp[pos][2])
+            refPos_snp.pop(pos)
+        ref_pos = ref_pos_del_list[-1]
+        ref_variants += '{},{},{}'.format(ref_pos, refPos_snp[ref_pos][1], refPos_snp[ref_pos][2])
+        refPos_snp.pop(ref_pos)
+        query_variants = ""
+        for index in range(len(que_pos_del_list)-1):
+            pos = que_pos_del_list[index]
+            query_variants += '{},{},{};'.format(pos, quePos_snp[pos][1], quePos_snp[pos][2])
+            quePos_snp.pop(pos)
+        que_pos = que_pos_del_list[-1]
+        query_variants += '{},{},{}'.format(que_pos, quePos_snp[que_pos][1], quePos_snp[que_pos][2])
+        quePos_snp.pop(que_pos)
+        #output_info = '{},{},{},{},{}'.format(blockSize, lowBound, upperBound+1, subSequence, refSequence.upper())
+        output_info = '{},{},{}'.format(min_pos, max_pos, sub_sequence)
+        match_string = '.\t{}\t{}\t{}\n'.format(ref_variants, query_variants, output_info)
+        output.write(match_string)
+        #==============================output matching results========================================
+        #for pos in ref_pos_del_set:
+            #refPos_snp.pop(pos)
+        #for pos in que_pos_del_set:
+            #quePos_snp.pop(pos)
+    output.close()
+def report(refPos_snp, quePos_snp, refOriginalNum, queOriginalNum):
+    positiveFile = open(args.false_positive, "a+")
+    negativeFile = open(args.false_negative, "a+")
+    #query_mismatch_file = open(args.false_positive, 'w')
+    #ref_mismatch_file = open(args.false_negative, 'w')
+    #true_pos_file = open(args.true_positive, 'w')
+    refList = list(refPos_snp.keys())
+    refList.sort()
+    for pos in refList:
+        #s = args.chr + "\t" + str(pos) + "\t" + str(pos+1) + "\n"
+        s = refPos_vcfEntry[pos] + '\n'
+        negativeFile.write(s)
+    negativeFile.close()
+    queList = list(quePos_snp.keys())
+    queList.sort()
+    for pos in queList:
+        #s = args.chr + "\t" + str(pos) + "\t" + str(pos+1) + "\n"
+        s = quePos_vcfEntry[pos] + '\n'
+        positiveFile.write(s)
+    positiveFile.close()
+    #match_set.sort()
+    #for pos in match_set:
+    #    s = args.chr + '\t' + str(pos) + '\t' + str(pos+1) + '\n'
+    #    true_pos_file.write(s)
+    #true_pos_file.close()
+    print ('\n######### Matching Result ################\n')
+    print (' ref total: {}\n que total: {}\n ref matches: {}\n que '\
+           'matches: {}\n ref mismatch: {}\n alt mismatch: {}\n'.format(\
+            refOriginalNum, queOriginalNum,refOriginalNum-len(refPos_snp),\
+            queOriginalNum-len(quePos_snp) , len(refPos_snp), len(quePos_snp)))
+    stat_file = open(args.stat, 'a+')
+    stat_file.write('{}\t{}\t{}\t{}\t{}\n'.format(args.chr, refOriginalNum,\
+                    queOriginalNum, refOriginalNum-len(refPos_snp), \
+                    queOriginalNum-len(quePos_snp)))
+    stat_file.close()
+    #print (len(ref_match_total), len(que_match_total))
+    #print multi_match, multi_match_ref, multi_match_que
+# check if sequence is exactly a tandem repeat
+def check_tandem_repeat(sequence):
+    sequence_length = len(sequence)
+    end_index = sequence_length / 2 + 1
+    final_checking = False
+    for repeat_length in range(1, end_index, 1):
+        #if sequence_length % repeat_length != 0:
+        #    continue
+        is_tandem_repeat = True
+        repeat_region = sequence[:repeat_length]
+        start_position = repeat_length
+        while(start_position < len(sequence)):
+            if start_position + repeat_length > sequence_length:
+                break
+            matching_region = sequence[start_position: start_position + repeat_length]
+            if matching_region != repeat_region:
+                is_tandem_repeat = False
+                break
+            start_position += repeat_length
+        if is_tandem_repeat:
+            final_checking = True
+            break
+    return final_checking
+# Employ hierarchical clustering; since the data is only one-dimensional, just check the distance between neighbours.
+# A lower bound is added to this clustering strategy: if the distance is larger than the lower bound,
+# check whether the sequence between the two variants is a repeat region (a so-called tandem repeat)
+# using the routine written for tandem repeat prediction.
+def clustering_snp(data_list, cluster_list, threshold, reference, lower_bound, refPos_snp, quePos_snp):
+    #r = refPos_snp[175243825]
+    #print r, r[0]
+    #q = quePos_snp[175243826]
+    #print q, q[0]
+    if len(data_list) < 1:
+        return
+    cluster_index = 0
+    previous_data = 0
+    for i in range(len(data_list)):
+        distance = data_list[i][0] - previous_data
+        #if data_list[i][0] == 175243838:
+        #    print '@@@@@@@@@', previous_data, distance
+        if distance > threshold:
+            cluster_index += 1
+        else:
+            if distance > lower_bound:
+                subsequence = reference[previous_data: data_list[i][0]]
+                if not check_tandem_repeat(subsequence):
+                    cluster_index += 1
+        cluster_list.append(cluster_index)
+        current_data = data_list[i][0]
+        data_source = data_list[i][1]
+        data_length = 1
+        if data_source > 0:
+            data_length = len(refPos_snp[current_data][1])
+        else:
+            data_length = len(quePos_snp[current_data][1])
+        current_data += data_length
+        if previous_data < current_data:
+            previous_data = current_data
+def main():
+    """
+    Command-line entry point: load the genome and both VCF files, run the matching
+    stages in order (direct search, complex search in both directions, clustering +
+    cluster search) and finally report whatever is still unmatched.
+    Reads its configuration from the module-level `args`.
+    """
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit()
+    # All three input files must exist before any output file is touched.
+    if not os.path.isfile(args.reference):
+        print ("Error: reference file not found")
+        parser.print_help()
+        sys.exit()
+    if not os.path.isfile(args.query):
+        print ("Error: query vcf file not found.")
+        parser.print_help()
+        sys.exit()
+    if not os.path.isfile(args.genome):
+        print("Error: genome file not found.")
+        parser.print_help()
+        sys.exit()
+    # Header of the match report; the file is truncated here and appended to by the later stages.
+    report_head = '##genome=' + args.genome + '\n'
+    report_head += '##ref=' + args.reference + '\n'
+    report_head += '##query=' + args.query + '\n'
+    report_head += '##chr_name=chromosome name of this data\n'
+    report_head += '##ref_variant=matched variant from reference set\n'
+    report_head += '##query_variant=matched variants from query set, corresponding to ref_variant\n'
+    report_head += '##variants in both ref_variants and query_variants are separated by ";"\n'
+    report_head += '##each variant is a tuple<POS,REF,ALT> separated by ",", POS is 0-based position, REF is sequence in reference genome, ALT is corresponding allele in donor genome\n'
+    report_head += '##info=matching information, if directly matched, there will be "."; if >1 variants in ref_variants or query_variants, info will be subsequence from genome, and the modified subsequence by ref_variants and query_variants\n'
+    report_head += '#chr_name\tref_variants\tquery_variants\tinfo\n'
+    with open(args.output, 'w') as output:
+        output.write(report_head)
+    # Concatenate every non-header line of the FASTA file into one string.
+    sequence = ""
+    print ('read genome file...')
+    seqFile = open(args.genome)
+    for line in seqFile.readlines():
+        if line.startswith(">"):
+            continue
+        line = line.strip()
+        sequence += line
+    seqFile.close()
+    # The false-negative file is truncated and seeded with the reference VCF header lines.
+    ref_mismatch_file = open(args.false_negative, 'w')
+    print ('read reference vcf file...')
+    hash_refPos_snp = {}
+    #refPos_snp = RedBlackTreeMap()
+    refFile = open(args.reference)
+    for line in refFile.readlines():
+        if line.startswith("#"):
+            ref_mismatch_file.write(line)
+            continue
+        line = line.strip()
+        columns = line.split("\t")
+        # VCF POS is 1-based; positions are stored 0-based.
+        pos = int(columns[1])-1
+        ref = columns[3]
+        alt = columns[4]
+        quality = columns[6]
+        # Records with several ALT alleles are skipped entirely.
+        if ',' in alt:
+            continue
+        # 'S' = same length, 'D' = REF longer than ALT, 'I' = REF shorter than ALT.
+        snpType = 'S'
+        if len(ref) > len(alt):
+            snpType = 'D'
+        elif len(ref) < len(alt):
+            snpType = 'I'
+        #print pos, snpType, ref, alt
+        hash_refPos_snp[pos] = [snpType, ref, alt]
+        refPos_vcfEntry[pos] = line
+        #refPos_quality[pos] = quality
+    refFile.close()
+    ref_mismatch_file.close()
+    # Same procedure for the query VCF; its header lines seed the false-positive file.
+    que_mismatch_file = open(args.false_positive, 'w')
+    print ('read query vcf file...')
+    hash_quePos_snp = {}
+    #quePos_snp = RedBlackTreeMap()
+    queFile = open(args.query)
+    for line in queFile.readlines():
+        if line.startswith("#"):
+            que_mismatch_file.write(line)
+            continue
+        line = line.strip()
+        columns = line.split("\t")
+        pos = int(columns[1])-1
+        ref = columns[3]
+        alt = columns[4]
+        if ',' in alt:
+            continue
+        snpType = 'S'
+        if len(ref) > len(alt):
+            snpType = 'D'
+        elif len(ref) < len(alt):
+            snpType = 'I'
+        hash_quePos_snp[pos] = [snpType, ref, alt]
+        quePos_vcfEntry[pos] = line
+    queFile.close()
+    que_mismatch_file.close()
+    # Totals are captured before any stage removes matched variants.
+    refOriginalNum = len(hash_refPos_snp)
+    queOriginalNum = len(hash_quePos_snp)
+    print ('first stage start...')
+    if refOriginalNum > 0 and queOriginalNum > 0:
+        direct_search(hash_refPos_snp, hash_quePos_snp)
+    #print ("after direct search: ", len(hash_refPos_snp), len(hash_quePos_snp))
+    # With --direct_search only the first stage is run.
+    if args.direct_search:
+        report(hash_refPos_snp, hash_quePos_snp, refOriginalNum, queOriginalNum)
+        return
+    # The remaining variants are copied into RedBlackTreeMap instances for the later stages.
+    refPos_snp = RedBlackTreeMap()
+    quePos_snp = RedBlackTreeMap()
+    for k in hash_refPos_snp:
+        refPos_snp[k] = hash_refPos_snp[k]
+    for k in hash_quePos_snp:
+        quePos_snp[k] = hash_quePos_snp[k]
+    print ('second stage start...')
+    # complex_search is run twice, the second time with the two maps swapped and the last flag set to True.
+    if len(refPos_snp) > 0 and len(quePos_snp) > 0:
+        complex_search(refPos_snp, quePos_snp, sequence, False)
+    if len(refPos_snp) > 0 and len(quePos_snp) > 0:
+        complex_search(quePos_snp, refPos_snp, sequence, True)
+    #print ("after complex search:", len(refPos_snp), len(quePos_snp))
+#=====================windows stage=============================================
+    '''
+    print ('third stage start...')
+    for block_size in [2, 4, 5,10,20,50,100,200]:
+        print ('try window size ' + str(block_size*2) + '...')
+        if len(refPos_snp) > 0 and len(quePos_snp) > 0:
+            multi_search(refPos_snp, quePos_snp, sequence, block_size)
+        #print ('after multi search in ' + str(block_size) + ' bp range:', len(refPos_snp), len(quePos_snp))
+    report(refPos_snp, quePos_snp, refOriginalNum, queOriginalNum)
+    '''
+#=====================windows stage============================================
+#=====================clustering stage=========================================
+    print ('start clustering...')
+    # data_list holds [position, source] pairs: source 1 for reference entries, -1 for query entries.
+    data_list = []
+    data_list_index_ref_que_dict = {}
+    #data_list_index_threshold = 0
+    for pos in refPos_snp:
+        pos_list = [pos, 1]
+        data_list.append(pos_list)
+        #data_list_index_ref_que_dict[pos] = 1
+        #data_list_index_threshold += 1
+    for pos in quePos_snp:
+        pos_list = [pos, -1]
+        data_list.append(pos_list)
+        #data_list_index_ref_que_dict[pos] = -1
+        #data_list_index += 1
+    data_list.sort()
+    #print data_list
+    cluster_list = []
+    #print data_list
+    #data = numpy.asarray(data_list)
+    # thresh and lower_bound are passed to clustering_snp as its threshold and lower_bound arguments.
+    thresh = 400
+    lower_bound = 10
+    clustering_snp(data_list, cluster_list, thresh, sequence, lower_bound, refPos_snp, quePos_snp)
+    #for i in range(len(data_list)):
+    #    print data_list[i], cluster_list[i]
+    #print 'clustring...'
+    #clusters = hcluster.fclusterdata(data, thresh)
+    #print 'finish clustering'
+    #cluster_list = clusters.tolist()
+    #print 'finish to list'
+    # this is for verify the cluster results, make sure all pos in short distance is in a cluster.
+    """
+    previous_class = -1
+    previous_coordinate = -100000
+    min_distance = 100000
+    for k in range(len(cluster_list)):
+        if cluster_list[k] != previous_class:
+            current_distance = data_list[k][0] - previous_coordinate
+            if current_distance < min_distance:
+                min_distance = current_distance
+        previous_class = cluster_list[k]
+        previous_coordinate = data_list[k][0]
+    print ('end clustering...')
+    print ('min distance between clusters:', min_distance)
+    print ('number of clusters:', len(cluster_list))
+    """
+    cluster_search(refPos_snp, quePos_snp, data_list, cluster_list, sequence)
+    report(refPos_snp, quePos_snp, refOriginalNum, queOriginalNum)
+#======================clustering stage==================================================
+    #exit()
+    '''
+    for i in range(len(data_list)):
+        data = data_list[i]
+        cluster = cluster_list[i]
+        pos = data[0]
+        category = data[1]
+        if category > 0:
+            if pos in refPos_snp:
+                print pos, cluster
+        else:
+            if pos in quePos_snp:
+                print pos, cluster
+    '''
+def test():
+    print check_tandem_repeat('AAACCAAAACCC')
+    print check_tandem_repeat('AAAAAAAAA')
+# Run the comparison only when the file is executed as a script, not when it is imported.
+if __name__ == '__main__':
+    main()
diff --git a/py/vcfcompare_backup.py b/py/vcfcompare_backup.py
new file mode 100644
index 0000000..b3bd140
--- /dev/null
+++ b/py/vcfcompare_backup.py
@@ -0,0 +1,677 @@
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+# Author: Chen Sun(chensun at cse.psu.edu)
+import sys
+# Refuse to run on interpreters older than Python 2.7: sys.hexversion is compared
+# against 0x02070000 and the script exits after printing the message below.
+versionError = "You are using an old version of python, please upgrade to python 2.7+\n"
+if sys.hexversion < 0x02070000:
+    print (versionError)
+    exit()
+#elif sys.hexversion > 0x03000000:
+#    print ("python 3")
+import subprocess
+import argparse
+import os
+import copy
+from lib.red_black_tree import RedBlackTreeMap
+# Command-line interface. The arguments are parsed at import time and the result is
+# kept in the module-level `args`, which the functions below read directly.
+citation = 'About algorithm used in VCF-Compare, please refer to "Method for Cross-Validating Variant Call Set" Section in our paper.'+'\n Please cite our paper.'
+parser = argparse.ArgumentParser(epilog = citation)
+# Required inputs: the two VCF files to compare and the FASTA genome they refer to.
+parser.add_argument('-r', '--reference', required=True, help = 'reference vcf file path, usually larger than query vcf file')
+parser.add_argument('-q', '--query', required=True, help = 'query vcf file path')
+parser.add_argument('-g', '--genome', required=True, help= 'reference genome file path, fasta file format')
+# Output locations, each with a default file name in the working directory.
+parser.add_argument('-p', '--false_positive', help='false positive, i.e. mismatch vcf entries in query vcf file, default=false_positive.vcf', default='false_positive.vcf')
+parser.add_argument('-n', '--false_negative', help='false negative, i.e. mismatch vcf entries in reference vcf file, default=false_negative.vcf', default='false_negative.vcf')
+#parser.add_argument('-t', '--true_positive', help='true positive bed file position', default='true_positive.bed')
+parser.add_argument('-o', '--output', help='output matched variants in stage 2 and 3, default=multi_match.out', default='multi_match.out')
+parser.add_argument('-d', '--direct_search', help='if activate, only perform stage 1, default=not activate', action = 'store_true')
+parser.add_argument('-c', '--chr', help='chromosome name or id, used for parallel multi genome analysis', default='.')
+parser.add_argument('-s', '--stat', help='append statistics result into a file, useful for parallel multi genome analysis', default='stat.txt')
+args = parser.parse_args()
+#match_set = []
+#matched_quality_set = []
+#refPos_quality = {}
+######################### for debug ###########################
+# Module-level lookup tables, created empty here and keyed by position (refPos_vcfEntry[pos], quePos_vcfEntry[pos]).
+refPos_vcfEntry = {}
+quePos_vcfEntry = {}
+#ref_match_total = set()
+#que_match_total = set()
+def direct_search(refPos_snp, quePos_snp):
+    global ref_match_total
+    global que_match_total
+    delList = []
+    num = 0
+    for key in quePos_snp:
+        if key in refPos_snp:
+            if refPos_snp[key] == quePos_snp[key]:
+                delList.append(key)
+                #match_set.append(key)
+                num += 1
+                #ref_match_total.add(key)
+                #que_match_total.add(key)
+    match_file = open('direct_search.txt', 'w')
+    for key in delList:
+        match_string = str(key) + ',' + str(refPos_snp[key]) + '\t' + str(key) + ',' + str(quePos_snp[key]) + '\n'
+        match_file.write(match_string)
+        #matched_quality_set.append(refPos_quality[key])
+        refPos_snp.pop(key, None) # delete value with key
+        quePos_snp.pop(key, None)
+    match_file.close()
+    #with open('matched_quality.txt', 'w') as quality:
+    #    for q in matched_quality_set:
+    #        quality.write(str(q)+'\n')
+    #print ("direct search found:", num)
+def modify_sequence(sequence, pos, snpSet):
+    if len(snpSet) != 3:
+        print ("Error: snp set size not right.")
+    ref = snpSet[1]
+    alt = snpSet[2]
+    if sequence[pos:pos+len(ref)].upper() != ref.upper():
+        pass
+    result = sequence[:pos] + alt + sequence[pos+len(ref):]
+    return result
+def near_search(refPos_snp, quePos_snp, genome, blockSize):
+    queRemoveList = [] # record quePos that should be deleted
+    genomeLen = len(genome) # record genome length
+    output = open(args.output, 'a') #open output file for
+    if refPos_snp is None:
+        print ("Error: refPos_snp is None")
+    if quePos_snp is None:
+        print ("Error: quePos_snp is None")
+    num = 0
+    for key in quePos_snp:
+        num += 1
+        ref_element = refPos_snp.find_nearest(key) # return a position
+        ref_snp = ref_element.value()
+        que_snp = quePos_snp[key]
+        refPos = ref_element.key()
+        quePos = key
+        if abs(refPos-key) > blockSize:
+            continue
+        if ref_snp[0] != que_snp[0]:
+            continue
+        #get the substring
+        seqStart = min(key, refPos)-100
+        if seqStart < 0:
+            seqStart = 0
+        seqEnd = max(key, refPos) + 100
+        if seqEnd > genomeLen-1:
+            seqEnd = genomeLen-1
+        subSequence = genome[seqStart:seqEnd+1]
+        refIndex = refPos-seqStart
+        queIndex = quePos-seqStart
+        #modify string and then compare
+        refSequence = modify_sequence(subSequence, refIndex, ref_snp)
+        queSequence = modify_sequence(subSequence, queIndex, que_snp)
+        if refSequence.upper() == queSequence.upper():
+            queRemoveList.append(quePos)
+            ref_variants = '{},{},{}'.format(refPos, ref_snp[1], ref_snp[2])
+            query_variants = '{},{},{}'.format(quePos, que_snp[1], que_snp[2])
+            output_info = '{},{}'.format(subSequence, refSequence.upper())
+            match_string = '.\t{}\t{}\t.\n'.format(ref_variants, query_variants)
+            output.write(match_string)
+            refPos_snp.pop(refPos, None)
+            break
+    output.close()
+    for pos in queRemoveList:
+        match_set.append(pos)
+        quePos_snp.pop(pos, None)
+def powerful_near_search(refPos_snp, quePos_snp, genome, blockSize):
+    queRemoveList = [] # record quePos that should be deleted
+    genomeLen = len(genome) # record genome length
+    output = open(args.output, 'a') #open output file for
+    if refPos_snp is None:
+        print ("Error: refPos_snp is None")
+    if quePos_snp is None:
+        print ("Error: quePos_snp is None")
+    num = 0
+    for key in quePos_snp:
+        num += 1
+        #print num
+        minPos = max(key-blockSize, 0)
+        maxPos = min(key+blockSize, genomeLen-1)
+        que_snp = quePos_snp[key]
+        quePos = key
+        for (k,v) in refPos_snp.find_range(minPos, maxPos):
+            ref_snp = v
+            refPos = k
+            if ref_snp[0] != que_snp[0]:
+                continue
+            #get the substring
+            seqStart = min(key, refPos)-100
+            if seqStart < 0:
+                seqStart = 0
+            seqEnd = max(key, refPos) + 100
+            if seqEnd > genomeLen-1:
+                seqEnd = genomeLen-1
+            subSequence = genome[seqStart:seqEnd+1]
+            refIndex = refPos-seqStart
+            queIndex = quePos-seqStart
+            #modify string and then compare
+            refSequence = modify_sequence(subSequence, refIndex, ref_snp)
+            queSequence = modify_sequence(subSequence, queIndex, que_snp)
+            if refSequence.upper() == queSequence.upper():
+                queRemoveList.append(quePos)
+                ref_variants = '{},{},{}'.format(refPos, ref_snp[1], ref_snp[2])
+                query_variants = '{},{},{}'.format(quePos, que_snp[1], que_snp[2])
+                output_info = '{},{}'.format(subSequence, refSequence.upper())
+                match_string = '.\t{}\t{}\t.\n'.format(ref_variants, query_variants)
+                output.write(match_string)
+                refPos_snp.pop(refPos, None)
+                break
+    output.close()
+    for pos in queRemoveList:
+        match_set.append(pos)
+        quePos_snp.pop(pos, None)
+def modify_by_list(pos_snp, posList, sequence, bound):
+    modList = copy.deepcopy(posList)
+    modList.sort(reverse=True)
+    for pos in modList:
+        snp = pos_snp[pos]
+        if len(snp) != 3:
+            print ("Error: snp set size not right.")
+        index = pos-bound
+        ref = snp[1]
+        alt = snp[2]
+        if sequence[index:index+len(ref)].upper() != ref.upper():
+            pass
+        sequence = sequence[:index] + alt + sequence[index+len(ref):]
+    return sequence
+def complex_search(refPos_snp, quePos_snp, genome, rev):
+    #global ref_match_total
+    #global que_match_total
+    queRemoveList = [] # record quePos that should be deleted
+    genomeLen = len(genome) # record genome length
+    output = open(args.output, 'a+') #open output file for
+    if refPos_snp is None:
+        print ("Error: refPos_snp is None")
+    if quePos_snp is None:
+        print ("Error: quePos_snp is None")
+    num = 0
+    start_position = None
+    for (key, value) in quePos_snp.find_range(None, None):
+        num += 1
+        que_snp = value
+        quePos = key
+        minPos = key
+        maxPos = min(key+len(que_snp[1])-1, genomeLen-1) + 1
+        candidateRefPos = []
+        candidateRefNode = []
+        temp_refPos_snp = {}
+        min_refPos = 3000000000
+        max_refPos = 0
+        for p in refPos_snp.linear_range_search(start_position, minPos, maxPos):
+            k = p.key()
+            v = p.value()
+            if min_refPos > k:
+                min_refPos = k
+            if max_refPos < k:
+                max_refPos = k
+            candidateRefNode.append(p)
+            candidateRefPos.append(k)
+            temp_refPos_snp[k] = v
+        #get the substring
+        if len(candidateRefPos) == 0:
+            continue
+        before = refPos_snp.before(candidateRefNode[0])
+        while before is not None and before.key() + len(before.value()[1]) - 1 >= minPos:
+            #print ('find before boundary in stage 2')
+            candidateRefNode.insert(0, before)
+            min_refPos = before.key()
+            candidateRefPos.append(before.key())
+            temp_refPos_snp[before.key()] = before.value()
+            before = refPos_snp.before(candidateRefNode[0])
+        candidateRefPos.sort()
+        seqStart = min(key, min_refPos)-100
+        if seqStart < 0:
+            seqStart = 0
+        seqEnd = max(key, max_refPos) + 100
+        if seqEnd > genomeLen-1:
+            seqEnd = genomeLen-1
+        subSequence = genome[seqStart:seqEnd+1]
+        queIndex = quePos-seqStart
+        #modify string and then compare
+        refSequence = modify_by_list(temp_refPos_snp, candidateRefPos, subSequence, seqStart)
+        queSequence = modify_sequence(subSequence, queIndex, que_snp)
+        if refSequence.upper() == queSequence.upper():
+            #matched
+            start_position = refPos_snp.after(candidateRefNode[-1])
+            queRemoveList.append(quePos)
+            ref_variants = ''
+            query_variants = ''
+            if not rev:
+                for index in range(len(candidateRefPos)-1):
+                    pos = candidateRefPos[index]
+                    ref_variants += '{},{},{};'.format(pos, temp_refPos_snp[pos][1], temp_refPos_snp[pos][2])
+                    #ref_match_total.add(pos)
+                    #be sure to recover
+                    refPos_snp.pop(pos)
+                ref_pos = candidateRefPos[-1]
+                ref_variants += '{},{},{}'.format(ref_pos, temp_refPos_snp[ref_pos][1], temp_refPos_snp[ref_pos][2])
+                #ref_match_total.add(ref_pos)
+                # be sure to recover
+                refPos_snp.pop(ref_pos)
+                #multi_match_ref += 1
+                query_variants = '{},{},{}'.format(quePos, que_snp[1], que_snp[2])
+            else:
+                for index in range(len(candidateRefPos)-1):
+                    pos = candidateRefPos[index]
+                    query_variants += '{},{},{};'.format(pos, temp_refPos_snp[pos][1], temp_refPos_snp[pos][2])
+                    #que_match_total.add(pos)
+                    refPos_snp.pop(pos)
+                ref_pos = candidateRefPos[-1]
+                query_variants += '{},{},{}'.format(ref_pos, temp_refPos_snp[ref_pos][1], temp_refPos_snp[ref_pos][2])
+                #que_match_total.add(ref_pos)
+                refPos_snp.pop(ref_pos)
+                #multi_match_ref += 1
+                ref_variants = '{},{},{}'.format(quePos, que_snp[1], que_snp[2])
+            output_info = '{},{}'.format(subSequence, refSequence.upper())
+            match_string = '.\t{}\t{}\t.\n'.format(ref_variants, query_variants)
+            output.write(match_string)
+        else:
+            start_position = candidateRefNode[0]
+    output.close()
+    for pos in queRemoveList:
+        #match_set.append(pos)
+        #que_match_total.add(pos)
+        quePos_snp.pop(pos, None)
+def multi_search(refPos_snp, quePos_snp, genome, blockSize):
+    #global ref_match_total
+    #global que_match_total
+    multi_match = 0
+    multi_match_ref = 0
+    multi_match_que = 0
+    one2multi = 0
+    multi2multi = 0
+    genomeLen = len(genome)
+    output = open(args.output, 'a+') #open output file for
+    refPosDelSet = set()
+    quePosDelSet = set()
+    #debug = False
+    ref_start_position = None
+    que_start_position = None
+    for key in quePos_snp.keys()[:]:
+        if not key in quePos_snp:  # logN operation
+            continue
+        candidateRefPos = []
+        candidateQuePos = []
+        candidateRefNode = []
+        candidateQueNode = []
+        minPos = max(key-blockSize, 0)
+        maxPos = min(key+blockSize, genomeLen-1) + 1
+        temp_refPos_snp = {}
+        for p in refPos_snp.linear_range_search(ref_start_position, minPos, maxPos):
+            k = p.key()
+            v = p.value()
+            candidateRefNode.append(p)
+            candidateRefPos.append(k)
+            temp_refPos_snp[k] = v
+        temp_quePos_snp = {}
+        for p in quePos_snp.linear_range_search(que_start_position, minPos, maxPos):
+            k = p.key()
+            v = p.value()
+            candidateQueNode.append(p)
+            candidateQuePos.append(k)
+            temp_quePos_snp[k] = v
+        if len(candidateQuePos) == 0:
+            print ("Error: query empty")
+            continue
+        if len(candidateRefPos) == 0:
+            continue
+        min_ref_pos = candidateRefPos[0]
+        max_ref_pos = candidateRefPos[-1] + len(temp_refPos_snp[candidateRefPos[-1]][1]) - 1
+        min_que_pos = candidateQuePos[0]
+        max_que_pos = candidateQuePos[-1] + len(temp_quePos_snp[candidateQuePos[-1]][1]) - 1
+        """
+        ref_before = refPos_snp.before(candidateRefNode[0])
+        que_before = quePos_snp.before(candidateQueNode[0])
+        while (ref_before is not None and ref_before.key() + len(ref_before.value()[0]) - 1 >= min_que_pos) or (que_before is not None and que_before.key() + len(que_before.value()[0])-1 > min_ref_pos):
+            #print ('find before boundary in stage 3')
+            if ref_before is not None and ref_before.key() + len(ref_before.value()[0]) - 1 >= min_que_pos :
+                candidateRefNode.insert(0, ref_before)
+                min_ref_pos = ref_before.key()
+                candidateRefPos.insert(0, ref_before.key())
+                temp_refPos_snp[ref_before.key()] = ref_before.value()
+                ref_before = refPos_snp.before(candidateRefNode[0])
+            if que_before is not None and que_before.key() + len(que_before.value()[0]) - 1 >= min_ref_pos :
+                candidateQueNode.insert(0, que_before)
+                min_que_pos = que_before.key()
+                candidateQuePos.insert(0, que_before.key())
+                temp_quePos_snp[que_before.key()] = que_before.value()
+                que_before = quePos_snp.before(candidateQueNode[0])
+        ref_after = refPos_snp.after(candidateRefNode[-1])
+        que_after = quePos_snp.after(candidateQueNode[-1])
+        while (ref_after is not None and ref_after.key() <= max_que_pos) or (que_after is not None and que_after.key() <= max_ref_pos):
+            #print ('find after boundary in stage 3')
+            if ref_after is not None and ref_after.key() <= max_que_pos :
+                candidateRefNode.append(ref_after)
+                max_ref_pos = ref_after.key() + len(ref_after.value()[1]) - 1
+                candidateRefPos.append(ref_after.key())
+                temp_refPos_snp[ref_after.key()] = ref_after.value()
+                ref_after = refPos_snp.after(candidateRefNode[-1])
+            if que_after is not None and que_after.key() <= max_ref_pos :
+                candidateQueNode.append(que_after)
+                max_que_pos = que_after.key() + len(que_after.value()[1]) - 1
+                candidateQuePos.append(que_after.key())
+                temp_quePos_snp[que_after.key()] = que_after.value()
+                que_after = quePos_snp.after(candidateQueNode[-1])
+        """
+        lowBound = candidateRefPos[0]
+        upperBound = candidateRefPos[-1]
+        if lowBound > candidateQuePos[0]:
+            lowBound = candidateQuePos[0]
+        if upperBound < candidateQuePos[-1]:
+            upperBound = candidateQuePos[-1]
+        lowBound = max(0, lowBound-100)
+        upperBound = min(upperBound+100, genomeLen-1)
+        subSequence = genome[lowBound: upperBound+1]
+        refSequence = modify_by_list(temp_refPos_snp, candidateRefPos, subSequence, lowBound)
+        queSequence = modify_by_list(temp_quePos_snp, candidateQuePos, subSequence, lowBound)
+        if refSequence.upper() == queSequence.upper():
+            #print ("multi_search works")
+            ref_start_position = refPos_snp.after(candidateRefNode[-1])
+            que_start_position = quePos_snp.after(candidateQueNode[-1])
+            ref_variants = ''
+            query_variants = ''
+            for index in range(len(candidateRefPos)-1):
+                pos = candidateRefPos[index]
+                ref_variants += '{},{},{};'.format(pos, temp_refPos_snp[pos][1], temp_refPos_snp[pos][2])
+                #ref_match_total.add(pos)
+                refPos_snp.pop(pos)
+                multi_match_ref += 1
+            ref_pos = candidateRefPos[-1]
+            ref_variants += '{},{},{}'.format(ref_pos, temp_refPos_snp[ref_pos][1], temp_refPos_snp[ref_pos][2])
+            #ref_match_total.add(ref_pos)
+            refPos_snp.pop(ref_pos)
+            multi_match_ref += 1
+            for index in range(len(candidateQuePos)-1):
+                pos = candidateQuePos[index]
+                query_variants += '{},{},{};'.format(pos, temp_quePos_snp[pos][1], temp_quePos_snp[pos][2])
+                #que_match_total.add(pos)
+                quePos_snp.pop(pos)
+                #quePosList.remove(pos)
+                #match_set.append(pos)
+                multi_match_que += 1
+                #quePosDelSet.add(pos)
+            que_pos = candidateQuePos[-1]
+            query_variants += '{},{},{}'.format(que_pos, temp_quePos_snp[que_pos][1], temp_quePos_snp[que_pos][2])
+            #que_match_total.add(que_pos)
+            quePos_snp.pop(que_pos)
+            #quePosList.remove(que_pos)
+            #match_set.append(que_pos)
+            multi_match_que += 1
+            #quePosDelSet.add(que_pos)
+            output_info = '{},{},{},{},{}'.format(blockSize, lowBound, upperBound+1, subSequence, refSequence.upper())
+            match_string = '.\t{}\t{}\t{}\n'.format(ref_variants, query_variants, output_info)
+            output.write(match_string)
+            multi_match += 1
+            if len(candidateRefPos) == 1 or len(candidateQuePos) == 1:
+                one2multi += 1
+            else:
+                multi2multi += 1
+        else:
+            ref_start_position = candidateRefNode[0]
+            que_start_position = candidateQueNode[0]
+    output.close()
+    #print (multi_match, multi_match_ref, multi_match_que, one2multi, multi2multi)
+def report(refPos_snp, quePos_snp, refOriginalNum, queOriginalNum):
+    positiveFile = open(args.false_positive, "a+")
+    negativeFile = open(args.false_negative, "a+")
+    #query_mismatch_file = open(args.false_positive, 'w')
+    #ref_mismatch_file = open(args.false_negative, 'w')
+    #true_pos_file = open(args.true_positive, 'w')
+    refList = list(refPos_snp.keys())
+    refList.sort()
+    for pos in refList:
+        #s = args.chr + "\t" + str(pos) + "\t" + str(pos+1) + "\n"
+        s = refPos_vcfEntry[pos] + '\n'
+        negativeFile.write(s)
+    negativeFile.close()
+    queList = list(quePos_snp.keys())
+    queList.sort()
+    for pos in queList:
+        #s = args.chr + "\t" + str(pos) + "\t" + str(pos+1) + "\n"
+        s = quePos_vcfEntry[pos] + '\n'
+        positiveFile.write(s)
+    positiveFile.close()
+    #match_set.sort()
+    #for pos in match_set:
+    #    s = args.chr + '\t' + str(pos) + '\t' + str(pos+1) + '\n'
+    #    true_pos_file.write(s)
+    #true_pos_file.close()
+    print ('\n######### Matching Result ################\n')
+    print (' ref total: {}\n que total: {}\n ref matches: {}\n que matches: {}\n ref mismatch: {}\n alt mismatch: {}\n'.format(refOriginalNum, queOriginalNum,refOriginalNum-len(refPos_snp), queOriginalNum-len(quePos_snp) , len(refPos_snp), len(quePos_snp)))
+    stat_file = open(args.stat, 'a+')
+    stat_file.write('{}\t{}\t{}\t{}\t{}\n'.format(args.chr, refOriginalNum, queOriginalNum, refOriginalNum-len(refPos_snp), queOriginalNum-len(quePos_snp)))
+    stat_file.close()
+    #print (len(ref_match_total), len(que_match_total))
+    #print multi_match, multi_match_ref, multi_match_que
+def main():
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit()
+    if not os.path.isfile(args.reference):
+        print ("Error: reference file not found")
+        parser.print_help()
+        sys.exit()
+    if not os.path.isfile(args.query):
+        print ("Error: query vcf file not found.")
+        parser.print_help()
+        sys.exit()
+    if not os.path.isfile(args.genome):
+        print("Error: genome file not found.")
+        parser.print_help()
+        sys.exit()
+    report_head = '##genome=' + args.genome + '\n'
+    report_head += '##ref=' + args.reference + '\n'
+    report_head += '##query=' + args.query + '\n'
+    report_head += '##chr_name=chromosome name of this data\n'
+    report_head += '##ref_variant=matched variant from reference set\n'
+    report_head += '##query_variant=matched variants from query set, corresponding to ref_variant\n'
+    report_head += '##variants in both ref_variants and query_variants are separated by ";"\n'
+    report_head += '##each variant is a tuple<POS,REF,ALT> separated by ",", POS is 0-based position, REF is sequence in reference genome, ALT is corresponding allele in donor genome\n'
+    report_head += '##info=matching information, if directly matched, there will be "."; if >1 variants in ref_variants or query_variants, info will be subsequence from genome, and the modified subsequence by ref_variants and query_variants\n'
+    report_head += '#chr_name\tref_variants\tquery_variants\tinfo\n'
+    with open(args.output, 'w') as output:
+        output.write(report_head)
+    sequence = ""
+    print ('read genome file...')
+    seqFile = open(args.genome)
+    for line in seqFile.readlines():
+        if line.startswith(">"):
+            continue
+        line = line.strip()
+        sequence += line
+    seqFile.close()
+    ref_mismatch_file = open(args.false_negative, 'w')
+    print ('read reference vcf file...')
+    hash_refPos_snp = {}
+    #refPos_snp = RedBlackTreeMap()
+    refFile = open(args.reference)
+    for line in refFile.readlines():
+        if line.startswith("#"):
+            ref_mismatch_file.write(line)
+            continue
+        line = line.strip()
+        columns = line.split("\t")
+        pos = int(columns[1])-1
+        ref = columns[3]
+        alt = columns[4]
+        quality = columns[6]
+        if ',' in alt:
+            continue
+        snpType = 'S'
+        if len(ref) > len(alt):
+            snpType = 'D'
+        elif len(ref) < len(alt):
+            snpType = 'I'
+        #print pos, snpType, ref, alt
+        hash_refPos_snp[pos] = [snpType, ref, alt]
+        refPos_vcfEntry[pos] = line
+        #refPos_quality[pos] = quality
+    refFile.close()
+    ref_mismatch_file.close()
+    que_mismatch_file = open(args.false_positive, 'w')
+    print ('read query vcf file...')
+    hash_quePos_snp = {}
+    #quePos_snp = RedBlackTreeMap()
+    queFile = open(args.query)
+    for line in queFile.readlines():
+        if line.startswith("#"):
+            que_mismatch_file.write(line)
+            continue
+        line = line.strip()
+        columns = line.split("\t")
+        pos = int(columns[1])-1
+        ref = columns[3]
+        alt = columns[4]
+        if ',' in alt:
+            continue
+        snpType = 'S'
+        if len(ref) > len(alt):
+            snpType = 'D'
+        elif len(ref) < len(alt):
+            snpType = 'I'
+        hash_quePos_snp[pos] = [snpType, ref, alt]
+        quePos_vcfEntry[pos] = line
+    queFile.close()
+    que_mismatch_file.close()
+    refOriginalNum = len(hash_refPos_snp)
+    queOriginalNum = len(hash_quePos_snp)
+    print ('first stage start...')
+    if refOriginalNum > 0 and queOriginalNum > 0:
+        direct_search(hash_refPos_snp, hash_quePos_snp)
+    #print ("after direct search: ", len(hash_refPos_snp), len(hash_quePos_snp))
+    if args.direct_search:
+        report(hash_refPos_snp, hash_quePos_snp, refOriginalNum, queOriginalNum)
+        return
+    refPos_snp = RedBlackTreeMap()
+    quePos_snp = RedBlackTreeMap()
+    for k in hash_refPos_snp:
+        refPos_snp[k] = hash_refPos_snp[k]
+    for k in hash_quePos_snp:
+        quePos_snp[k] = hash_quePos_snp[k]
+    print ('second stage start...')
+    if len(refPos_snp) > 0 and len(quePos_snp) > 0:
+        complex_search(refPos_snp, quePos_snp, sequence, False)
+    if len(refPos_snp) > 0 and len(quePos_snp) > 0:
+        complex_search(quePos_snp, refPos_snp, sequence, True)
+    #print ("after complex search:", len(refPos_snp), len(quePos_snp))
+    print ('third stage start...')
+    for block_size in [2, 4, 5,10,20,50,100,200]:
+        print ('try window size ' + str(block_size*2) + '...')
+        if len(refPos_snp) > 0 and len(quePos_snp) > 0:
+            multi_search(refPos_snp, quePos_snp, sequence, block_size)
+        #print ('after multi search in ' + str(block_size) + ' bp range:', len(refPos_snp), len(quePos_snp))
+    report(refPos_snp, quePos_snp, refOriginalNum, queOriginalNum)
+# Run the comparison only when this file is executed as a script, not when it is imported.
+if __name__ == '__main__':
+    main()
diff --git a/script/add_marker.py b/script/add_marker.py
new file mode 100644
index 0000000..e69de29
diff --git a/script/compare_match.py b/script/compare_match.py
new file mode 100644
index 0000000..7a90f60
--- /dev/null
+++ b/script/compare_match.py
@@ -0,0 +1,44 @@
+from sys import argv
+baseline_filename = argv[1]
+query_filename = argv[2]
+baseline_pos_line = {}
+baseline_pos_content = {}
+query_pos_line = {}
+query_pos_content = {}
+def read_file(filename):
+    pos_line = {}
+    pos_content = {}
+    with open (filename) as f:
+        for line in f:
+            if(line.startswith('#')):
+                continue
+            line = line.strip()
+            columns = line.split('\t')
+            content = '\t'.join(columns[1:-1])
+            pos = int(columns[1])
+            pos_line[pos] = line
+            pos_content[pos] = content
+    return pos_line, pos_content
+(baseline_pos_line, baseline_pos_content)= read_file(baseline_filename)
+(query_pos_line, query_pos_content)= read_file(query_filename)
+for pos in baseline_pos_content:
+    if pos not in query_pos_content:
+        print pos, "exist in baseline but not in query"
+        print baseline_pos_line[pos]
+    else:
+        if baseline_pos_content[pos] != query_pos_content[pos]:
+            print "same pos but not equal content"
+            print baseline_pos_line[pos]
+            print query_pos_line[pos]
+for pos in query_pos_content:
+    if pos not in baseline_pos_content:
+        print pos, "in query but not in baseline"
+        print query_pos_line[pos]
\ No newline at end of file
diff --git a/script/count_decomposed_matching.py b/script/count_decomposed_matching.py
new file mode 100644
index 0000000..faedac4
--- /dev/null
+++ b/script/count_decomposed_matching.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+# Authors:
+#     Chen Sun(chensun at cse.psu.edu)
+#     Paul Medvedev(pashadag at cse.psu.edu)
+from sys import argv
+# Positional command-line arguments argv[1] and argv[2]; an IndexError is raised if either is missing.
+decomposed_filename = argv[1]
+matching_filename = argv[2]
\ No newline at end of file
diff --git a/script/direct_match.py b/script/direct_match.py
new file mode 100644
index 0000000..70b45cd
--- /dev/null
+++ b/script/direct_match.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+# Authors:
+#     Chen Sun(chensun at cse.psu.edu)
+#     Paul Medvedev(pashadag at cse.psu.edu)
+from sys import argv
+# Positional command-line arguments argv[1] and argv[2]; an IndexError is raised if either is missing.
+baseline_filename = argv[1]
+query_filename = argv[2]
+# Empty containers for the baseline and query variants; nothing in the visible lines fills them.
+baseline_variant = {}
+query_variant = {}
\ No newline at end of file
diff --git a/script/filter_hc.py b/script/filter_hc.py
new file mode 100644
index 0000000..d9f10eb
--- /dev/null
+++ b/script/filter_hc.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+# Authors:
+#     Paul Medvedev(pashadag at cse.psu.edu)
+#     Chen Sun(chensun at cse.psu.edu)
+import sys
+import textwrap as _textwrap
+import argparse
+import os
+from intervaltree import Interval, IntervalTree
+versionError = "You are using an old version of python, please upgrade to python 2.7+\n"
+if sys.hexversion < 0x02070000:
+    print (versionError)
+    exit()
+RUN = True
+author_email = 'chensun at cse.psu.edu'
+chr_2_index = {}
+index_2_chr = {}
+for i in range(22):
+    chr_2_index[str(i+1)] = i
+    index_2_chr[i] = str(i+1)
+chr_2_index['X'] = 22
+index_2_chr[22] = 'X'
+chr_2_index['Y'] = 23
+index_2_chr[23] = 'Y'
+class SmartFormatter(argparse.HelpFormatter):
+    def _split_lines(self, text, width):
+        paragraphs = text.split('\n')
+        #return paragraphs
+        multiline_text = []
+        for paragraph in paragraphs:
+            formatted_paragraph = _textwrap.wrap(paragraph, width)
+            multiline_text = multiline_text + formatted_paragraph
+        return multiline_text
+    def _fill_text(self, text, width, indent):
+        return ''.join(indent + line for line in text.splitlines(True))
+citation = 'Please cite our paper.'
+parser = argparse.ArgumentParser(prog="filter_hc", epilog = citation, formatter_class=lambda prog: SmartFormatter(prog,max_help_position=8))
+parser.add_argument('vcf_files', nargs='+', metavar='File List', help='VCF file list')
+parser.add_argument('-b', '--bed_file', help='bed hc file', metavar='File')
+parser.add_argument('-o', '--output', help='output directory', metavar='DIRECTORY',default='./nhc')
+args = parser.parse_args()
+if not os.path.exists(args.output):
+    os.mkdir(args.output)
+interval_tree_list = []
+for i in range(24):
+    interval_tree_list.append(IntervalTree())
+with open (args.bed_file) as bed:
+    for line in bed.readlines():
+        if line.startswith('#'):
+            continue
+        columns = line.split('\t')
+        chr_name = columns[0]
+        if chr_name not in chr_2_index:
+            print 'BED: ' + line
+            continue
+        chr_index = chr_2_index[chr_name]
+        start_p = int(columns[1])
+        end_p = int(columns[2]) # 0 based, exclude end position
+        interval_tree_list[chr_index][start_p: end_p] = (start_p, end_p)
+for vcf_filename in args.vcf_files:
+    match_basename = os.path.basename(vcf_filename)
+    nhc_filename = args.output + '/' + match_basename + '.nhc.vcf'
+    output_list = []
+    with open(vcf_filename) as vcf_file:
+        for line in vcf_file.readlines():
+            if line.startswith('#'):
+                output_list.append(line)
+                continue
+            columns = line.split('\t')
+            chr_name = columns[0]
+            if chr_name not in chr_2_index:
+                output_list.append(line)
+                continue
+            chr_index = chr_2_index[chr_name]
+            var_pos = int(columns[1]) - 1 # 1 based system to 0 based system
+            query_result = interval_tree_list[chr_index][var_pos]
+            if len(query_result) == 0:
+                output_list.append(line)
+    nhc_file = open(nhc_filename, 'w')
+    for line in output_list:
+        nhc_file.write(line)
+    nhc_file.close()
\ No newline at end of file
diff --git a/script/filter_lcr.py b/script/filter_lcr.py
new file mode 100644
index 0000000..d9f10eb
--- /dev/null
+++ b/script/filter_lcr.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+# Authors:
+#     Paul Medvedev(pashadag at cse.psu.edu)
+#     Chen Sun(chensun at cse.psu.edu)
+import sys
+import textwrap as _textwrap
+import argparse
+import os
+from intervaltree import Interval, IntervalTree
+versionError = "You are using an old version of python, please upgrade to python 2.7+\n"
+if sys.hexversion < 0x02070000:
+    print (versionError)
+    exit()
+RUN = True
+author_email = 'chensun at cse.psu.edu'
+chr_2_index = {}
+index_2_chr = {}
+for i in range(22):
+    chr_2_index[str(i+1)] = i
+    index_2_chr[i] = str(i+1)
+chr_2_index['X'] = 22
+index_2_chr[22] = 'X'
+chr_2_index['Y'] = 23
+index_2_chr[23] = 'Y'
+class SmartFormatter(argparse.HelpFormatter):
+    def _split_lines(self, text, width):
+        paragraphs = text.split('\n')
+        #return paragraphs
+        multiline_text = []
+        for paragraph in paragraphs:
+            formatted_paragraph = _textwrap.wrap(paragraph, width)
+            multiline_text = multiline_text + formatted_paragraph
+        return multiline_text
+    def _fill_text(self, text, width, indent):
+        return ''.join(indent + line for line in text.splitlines(True))
+citation = 'Please cite our paper.'
+parser = argparse.ArgumentParser(prog="filter_hc", epilog = citation, formatter_class=lambda prog: SmartFormatter(prog,max_help_position=8))
+parser.add_argument('vcf_files', nargs='+', metavar='File List', help='VCF file list')
+parser.add_argument('-b', '--bed_file', help='bed hc file', metavar='File')
+parser.add_argument('-o', '--output', help='output directory', metavar='DIRECTORY',default='./nhc')
+args = parser.parse_args()
+if not os.path.exists(args.output):
+    os.mkdir(args.output)
+interval_tree_list = []
+for i in range(24):
+    interval_tree_list.append(IntervalTree())
+with open (args.bed_file) as bed:
+    for line in bed.readlines():
+        if line.startswith('#'):
+            continue
+        columns = line.split('\t')
+        chr_name = columns[0]
+        if chr_name not in chr_2_index:
+            print 'BED: ' + line
+            continue
+        chr_index = chr_2_index[chr_name]
+        start_p = int(columns[1])
+        end_p = int(columns[2]) # 0 based, exclude end position
+        interval_tree_list[chr_index][start_p: end_p] = (start_p, end_p)
+for vcf_filename in args.vcf_files:
+    match_basename = os.path.basename(vcf_filename)
+    nhc_filename = args.output + '/' + match_basename + '.nhc.vcf'
+    output_list = []
+    with open(vcf_filename) as vcf_file:
+        for line in vcf_file.readlines():
+            if line.startswith('#'):
+                output_list.append(line)
+                continue
+            columns = line.split('\t')
+            chr_name = columns[0]
+            if chr_name not in chr_2_index:
+                output_list.append(line)
+                continue
+            chr_index = chr_2_index[chr_name]
+            var_pos = int(columns[1]) - 1 # 1 based system to 0 based system
+            query_result = interval_tree_list[chr_index][var_pos]
+            if len(query_result) == 0:
+                output_list.append(line)
+    nhc_file = open(nhc_filename, 'w')
+    for line in output_list:
+        nhc_file.write(line)
+    nhc_file.close()
\ No newline at end of file
diff --git a/script/overlap.py b/script/overlap.py
new file mode 100644
index 0000000..90673e0
--- /dev/null
+++ b/script/overlap.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+    Authors:
+    Chen Sun(chensun at cse.psu.edu)
+    Paul Medvedev(pashadag at cse.psu.edu)
+"""
+import sys
+import textwrap as _textwrap
+import argparse
+import os
+import matplotlib
+import matplotlib.pyplot as plt
+from matplotlib_venn import venn2, venn2_circles
+versionError = "You are using an old version of python, please upgrade to python 2.7+\n"
+if sys.hexversion < 0x02070000:
+    print (versionError)
+    exit()
+RUN = True
+author_email = 'chensun at cse.psu.edu'
+class SmartFormatter(argparse.HelpFormatter):
+    """Help formatter that wraps each newline-separated paragraph on its own."""
+    def _split_lines(self, text, width):
+        # Wrap paragraph by paragraph and concatenate the wrapped lines;
+        # textwrap.wrap() returns an empty list for an empty paragraph.
+        wrapped = []
+        for chunk in text.split('\n'):
+            wrapped.extend(_textwrap.wrap(chunk, width))
+        return wrapped
+    def _fill_text(self, text, width, indent):
+        # Prefix every existing line with the indent; width is not used.
+        pieces = [indent + piece for piece in text.splitlines(True)]
+        return ''.join(pieces)
+citation = 'Please cite our paper.'
+parser = argparse.ArgumentParser(prog="overlap", epilog = citation, formatter_class=lambda prog: SmartFormatter(prog,max_help_position=8))
+parser.add_argument('match_files', nargs='+', metavar='File List', help='.match file list, should take the same VCF as baseline')
+args = parser.parse_args()
+baselinevar_num = {}
+# key is baseline variant, identified by chr_pos_ref_alt_phasing
+variant_num_list = []
+def read_file(filename):
+    """
+    Tally the baseline variants referenced by one .match file.
+    Every counted key increments its entry in the module-level baselinevar_num
+    dict, and the number of keys counted for this file is appended to
+    variant_num_list (it is also left in the module-level variant_num).
+    """
+    global baselinevar_num
+    global variant_num
+    variant_num = 0
+    with open(filename) as match_file:
+        for raw_line in match_file:
+            if raw_line.startswith('#'):
+                continue
+            fields = raw_line.strip().split('\t')
+            chrom = fields[0]
+            baseline_entries = fields[4].split(';')
+            if baseline_entries[0] == '.':
+                # fifth column is '.': key on the row's own second column
+                keys = [chrom + ',' + fields[1]]
+            else:
+                # one key per ';'-separated entry, using its first ','-field
+                keys = [chrom + ',' + entry.split(',')[0] for entry in baseline_entries]
+            for key in keys:
+                baselinevar_num[key] = baselinevar_num.get(key, 0) + 1
+                variant_num += 1
+    variant_num_list.append(variant_num)
+for filename in args.match_files:
+    read_file(filename)
+# a baseline key counts as shared once it was tallied at least once per input file
+overlap_threshold = len(args.match_files)
+print overlap_threshold
+overlap_num = 0
+for baseline_key in baselinevar_num:
+    if baselinevar_num[baseline_key] >= overlap_threshold:
+        overlap_num += 1
+print overlap_num, variant_num_list
+# Subset sizes
+# NOTE(review): only the first two match files are drawn; fewer than two raises IndexError
+s = (
+    variant_num_list[0]-overlap_num,  # Ab
+    variant_num_list[1]-overlap_num,  # aB
+    overlap_num,  # AB
+)
+v = venn2(subsets=s, set_labels=('bwa-fb', 'pt'))
+# Subset labels
+v.get_label_by_id('10').set_text(format(s[0], ',d'))
+v.get_label_by_id('01').set_text(format(s[1], ',d'))
+v.get_label_by_id('11').set_text(format(s[2], ',d'))
+# Subset colors
+# Subset alphas
+for text in v.set_labels:
+    text.set_fontsize(32)
+for text in v.subset_labels:
+    text.set_fontsize(32)
+# Border styles
+c = venn2_circles(subsets=s, linestyle='solid', linewidth='0')
+#c[0].set_ls('dashed')  # Line style
+#c[0].set_lw(2.0)       # Line width
diff --git a/script/overlap_direct.py b/script/overlap_direct.py
new file mode 100644
index 0000000..ee7c6e0
--- /dev/null
+++ b/script/overlap_direct.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+    Authors:
+    Chen Sun(chensun at cse.psu.edu)
+    Paul Medvedev(pashadag at cse.psu.edu)
+"""
+import sys
+import textwrap as _textwrap
+import argparse
+import os
+import matplotlib
+import matplotlib.pyplot as plt
+from matplotlib_venn import venn2, venn2_circles
+versionError = "You are using an old version of python, please upgrade to python 2.7+\n"
+if sys.hexversion < 0x02070000:
+    print (versionError)
+    exit()
+RUN = True
+author_email = 'chensun at cse.psu.edu'
+class SmartFormatter(argparse.HelpFormatter):
+    """Help formatter that wraps each newline-separated paragraph on its own."""
+    def _split_lines(self, text, width):
+        # Flatten the per-paragraph wrap results into one list of lines;
+        # an empty paragraph contributes nothing because wrap() returns [].
+        return [
+            wrapped_line
+            for paragraph in text.split('\n')
+            for wrapped_line in _textwrap.wrap(paragraph, width)
+        ]
+    def _fill_text(self, text, width, indent):
+        # Indent each line as-is, keeping its line ending; width is not used.
+        indented = []
+        for line in text.splitlines(True):
+            indented.append(indent + line)
+        return ''.join(indented)
+citation = 'Please cite our paper.'
+parser = argparse.ArgumentParser(prog="overlap", epilog = citation, formatter_class=lambda prog: SmartFormatter(prog,max_help_position=8))
+parser.add_argument('match_files', nargs='+', metavar='File List', help='.match file list, should take the same VCF as baseline')
+args = parser.parse_args()
+baselinevar_num = {}
+# key is baseline variant, identified by chr_pos_ref_alt_phasing
+variant_num_list = []
+def read_file(filename):
+    """
+    Tally the variants of one .match file by chromosome and second column.
+    Every non-comment row increments its key in the module-level
+    baselinevar_num dict; the row count for this file is appended to
+    variant_num_list (it is also left in the module-level variant_num).
+    """
+    global baselinevar_num
+    global variant_num
+    variant_num = 0
+    with open(filename) as match_file:
+        for raw_line in match_file:
+            if raw_line.startswith('#'):
+                continue
+            fields = raw_line.strip().split('\t')
+            key = fields[0] + ',' + fields[1]
+            baselinevar_num[key] = baselinevar_num.get(key, 0) + 1
+            variant_num += 1
+    variant_num_list.append(variant_num)
+for filename in args.match_files:
+    read_file(filename)
+# a key counts as shared once it was tallied at least once per input file
+overlap_threshold = len(args.match_files)
+print overlap_threshold
+overlap_num = 0
+for baseline_key in baselinevar_num:
+    if baselinevar_num[baseline_key] >= overlap_threshold:
+        overlap_num += 1
+print overlap_num, variant_num_list
+# Subset sizes
+# NOTE(review): only the first two match files are drawn; fewer than two raises IndexError
+s = (
+    variant_num_list[0]-overlap_num,  # Ab
+    variant_num_list[1]-overlap_num,  # aB
+    overlap_num,  # AB
+)
+v = venn2(subsets=s, set_labels=('bwa-fb', 'pt'))
+# Subset labels
+v.get_label_by_id('10').set_text(format(s[0], ',d'))
+v.get_label_by_id('01').set_text(format(s[1], ',d'))
+v.get_label_by_id('11').set_text(format(s[2], ',d'))
+# Subset colors
+# Subset alphas
+for text in v.set_labels:
+    text.set_fontsize(32)
+for text in v.subset_labels:
+    text.set_fontsize(32)
+# Border styles
+c = venn2_circles(subsets=s, linestyle='solid', linewidth='0')
+#c[0].set_ls('dashed')  # Line style
+#c[0].set_lw(2.0)       # Line width
diff --git a/script/varmatch b/script/varmatch
new file mode 100644
index 0000000..1b4bc6c
--- /dev/null
+++ b/script/varmatch
@@ -0,0 +1,484 @@
+#!/usr/bin/env python
+import sys
+import subprocess
+import argparse
+import os
+import time
+RUN = True
+author_email = 'chensun at cse.psu.edu'
+versionError = 'You are using an old version of python, please upgrade to python 2.7+\n'
+if sys.hexversion < 0x02070000:
+    print (versionError)
+    exit()
+citation = 'Please cite our paper'
+parser = argparse.ArgumentParser(epilog=citation)
+parser.add_argument('-r', metavar='reference.vcf', help='reference vcf file path')
+parser.add_argument('-q', metavar='query.vcf', help='query vcf file path')
+parser.add_argument('-g', metavar='genome.fa', help='genome sequence file path, FASTA file format')
+parser.add_argument('-t', metavar='N', default='1', help='thread number for parallel')
+parser.add_argument('-n', '--normalize', action='store_true',
+                    help='if activate, VarMatch will normalize reference vcf and query vcf file before comparing.')
+#parser.add_argument('-d', '--direct_search', action='store_true', help='if activate, only perform direct matching')
+parser.add_argument('--multi_genome', metavar='genome_list.txt',
+                    help='genome list file contain chromosome name and FASTA file absolute path')
+parser.add_argument('--multi_vcf', nargs='+', metavar='file.vcf ...',
+                    help='vcf files (usually more than two) that need to compare')
+parser.add_argument('-o', '--output', metavar='output/', help='output directory, default is the current directory')
+#parser.add_argument('-v', '--visualize', help='visualize results')
+parser.add_argument('--purify', action='store_true',
+                    help='if activate, VarMatch will check if variant matches reference genome sequence.')
+parser.add_argument('-H', action='store_true',
+                    help='if active, VarMatch will not match haplotype')
+#parser.add_argument('--remove_dup', metavar='single.vcf',
+#                    help='provide single vcf file, remove duplications in this vcf file')
+# strategy for whole genome:
+#   since the current version of vt-normalize cannot handle whole-genome sequence data,
+#   the VCF files are split by the chromosomes listed in the --multi_genome file
+args = parser.parse_args()
+human_chromosome_list = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17',
+                         '18', '19', '20', '21', '22', 'X', 'Y']
+def shell_run(command, hide=False):
+    """
+    Run a command line through the shell, or only echo it when RUN is off.
+    command: command line string handed to the shell.
+    hide: when True, send the command's stdout and stderr to os.devnull.
+    """
+    if not RUN:
+        # dry-run mode: pause, then show the command instead of executing it
+        time.sleep(3.5)
+        print(command)
+        return
+    if hide:  # hide output
+        # the with-block closes the null device even if call() raises
+        with open(os.devnull, 'w') as FNULL:
+            subprocess.call(command, shell=True, stdout=FNULL, stderr=subprocess.STDOUT)
+    else:
+        subprocess.call(command, shell=True)
+def check_command(command):
+    """
+    check if corresponding command available
+    Returns True when command is an existing file path, or when a directory
+    listed in the PATH environment variable contains an entry named command;
+    otherwise returns False.
+    """
+    if os.path.isfile(command):
+        return True
+    # os.pathsep instead of a hard-coded ':'; an unset PATH gives nothing to search
+    for cmdpath in os.environ.get('PATH', '').split(os.pathsep):
+        if not os.path.isdir(cmdpath):
+            continue
+        try:
+            entries = os.listdir(cmdpath)
+        except OSError:
+            # unreadable PATH entry: skip it instead of aborting the search
+            continue
+        if command in entries:
+            return True
+    return False
+# [todo] check vcf files, corresponding genome file should exist
+# purify vcf file
+def purify(input_file, output_file, genome_file):
+    """
+    Run the external purify tool: input_file and genome_file in, output_file out.
+    The tool's availability is checked until one check has succeeded; if it
+    cannot be found, an error is printed and the program exits.
+    """
+    global check_purify_command
+    if not (check_purify_command or check_command(purify_tool)):
+        print ('Error: can not find program: ' + purify_tool)
+        print ('\t Try "make" command before execute, or contact author for support: ' + author_email)
+        exit()
+    # remember the successful check so later calls skip it
+    check_purify_command = True
+    shell_run(purify_tool + ' -i ' + input_file + ' -g ' + genome_file + ' -o ' + output_file)
+def pairwise_compare(reference_file, query_file, genome_file, output_prefix):
+    """
+    Run the external compare tool on one reference/query VCF pair.
+    The -G, -n and -t options are appended according to args.H,
+    args.normalize and args.t (only when the thread count exceeds 1).
+    """
+    global check_compare_command
+    if not (check_compare_command or check_command(compare_tool)):
+        print ('Error: can not find program: ' + compare_tool)
+        print ('\t Try "make" command before execute, or contact author for support: ' + author_email)
+        exit()
+    # remember the successful check so later calls skip it
+    check_compare_command = True
+    base_command = ''.join([compare_tool, ' -r ', reference_file, ' -q ', query_file,
+                            ' -g ', genome_file, ' -o ', output_prefix])
+    options = []
+    if args.H:
+        options.append(' -G ')
+    if args.normalize:
+        options.append(' -n ')
+    if args.t is not None and int(args.t) > 1:
+        options.append(' -t ' + args.t)
+    shell_run(base_command + ''.join(options))
+def varmatch_pairwise(reference_file, query_file, genome_file, output_directory):
+    """
+    Compare one reference VCF with one query VCF against a genome file.
+    With args.purify set, both VCFs are first purified into temp_dir and the
+    purified copies are compared instead of the originals.
+    Returns the output prefix passed to the compare step:
+    output_directory/<reference basename>_<query basename>.
+    """
+    ref_name = os.path.basename(reference_file)
+    que_name = os.path.basename(query_file)
+    ref_purified = temp_dir + '/' + ref_name + '.purify.vcf'
+    que_purified = temp_dir + '/' + que_name + '.purify.vcf'
+    if args.purify:
+        purify(reference_file, ref_purified, genome_file)
+        purify(query_file, que_purified, genome_file)
+        ref_input, que_input = ref_purified, que_purified
+    else:
+        ref_input, que_input = reference_file, query_file
+    output_prefix = output_directory + '/' + ref_name + '_' + que_name
+    pairwise_compare(ref_input, que_input, genome_file, output_prefix)
+    return output_prefix
+def detect_multi_genome(genome_list_file, chr_list):
+    """
+    Read a genome list file and map chromosome names to FASTA paths.
+    Each line holds whitespace-separated columns: chromosome name, then path.
+    Only chromosomes present in chr_list are kept. Blank lines and lines
+    without a second column are skipped instead of raising IndexError.
+    Returns a dict: chromosome name -> second column of its line.
+    """
+    genome_dict = {}
+    with open(genome_list_file) as f:
+        for line in f:
+            columns = line.split()
+            if len(columns) < 2:
+                continue
+            chr_name = columns[0]
+            if chr_name in chr_list:
+                genome_dict[chr_name] = columns[1]
+    return genome_dict
+def split_multi_genome(vcf_file, detected_chr_list):
+    """
+    Split one VCF file into per-chromosome VCF files under temp_dir.
+    Lines starting with '#' are copied to every output. A record is written to
+    the first chromosome c in detected_chr_list whose name equals the record's
+    first column, with or without a leading 'chr'; other records are dropped.
+    Returns a dict: chromosome name -> path of its split VCF file.
+    """
+    print ('Split variant file according to chromosomes...')
+    basename = os.path.basename(vcf_file)
+    vcf_name_dict = {}
+    for c in detected_chr_list:
+        vcf_name_dict[c] = temp_dir + '/' + basename + '.' + c + '.vcf'
+    vcf_handle_dict = {}
+    try:
+        for c in detected_chr_list:
+            vcf_handle_dict[c] = open(vcf_name_dict[c], 'w')
+        with open(vcf_file) as f:
+            for line in f:
+                if line.startswith('#'):
+                    for c in detected_chr_list:
+                        vcf_handle_dict[c].write(line)
+                    continue
+                # parse the chromosome column once per record, not once per candidate
+                chromosome_name = line.split('\t')[0]
+                for c in detected_chr_list:
+                    if chromosome_name == c or chromosome_name == 'chr' + c:
+                        vcf_handle_dict[c].write(line)
+                        break
+    finally:
+        # close every output that was opened, even if reading or writing failed
+        for handle in vcf_handle_dict.values():
+            handle.close()
+    return vcf_name_dict
+def varmatch_multi_genome(reference_file, query_file, genome_list_file):
+    """
+    Compare two VCF files chromosome by chromosome and merge the results.
+    Both VCFs are split by the chromosomes found in genome_list_file, each
+    pair is matched with varmatch_pairwise() inside temp_dir, and the
+    per-chromosome .stat and .match outputs are combined into output_dir.
+    Exits if a listed genome file does not exist.
+    """
+    # split vcf according to chromosome and then use varmatch_pairwise
+    genome_dict = detect_multi_genome(genome_list_file, human_chromosome_list)
+    detected_chr_list = list(genome_dict.keys())
+    detected_chr_list.sort()
+    print ('\t[Multiple genome mode]')
+    print ('\tDetected genomes:')
+    chr_list_string = '\t'
+    for c in detected_chr_list:
+        chr_list_string += c + ','
+    chr_list_string = chr_list_string[:-1] + '\n'
+    print (chr_list_string)
+    for c in detected_chr_list:
+        if not os.path.isfile(genome_dict[c]):
+            print ('[Error:] Can not find genome file ' + genome_dict[c])
+            exit()
+    ref_vcf_dict = split_multi_genome(reference_file, detected_chr_list)
+    que_vcf_dict = split_multi_genome(query_file, detected_chr_list)
+    # varmatch_pairwise returns the prefix its .stat/.match outputs are written under
+    chr_prefix_dict = {}
+    for c in detected_chr_list:
+        print('Matching chromosome ' + c + '...')
+        chr_prefix_dict[c] = varmatch_pairwise(ref_vcf_dict[c], que_vcf_dict[c], genome_dict[c], temp_dir)
+    total_stat = [0] * 6 # total_ref, total_que, matched_ref, matched_que, mismatch_ref, mismatch_que
+    for c in detected_chr_list:
+        with open(chr_prefix_dict[c] + '.stat') as f:
+            lines = f.readlines()
+        # never index past the end of a short .stat file
+        for i in range(min(len(total_stat), len(lines))):
+            total_stat[i] += int(lines[i].strip())
+    total_prefix = output_dir + '/' + os.path.basename(reference_file) + '_' + os.path.basename(
+        query_file)
+    with open(total_prefix + '.stat', 'w') as f:
+        for s in total_stat:
+            f.write(str(s))
+            f.write('\n')
+    with open(total_prefix + '.match', 'w') as total_complex_file:
+        total_complex_file.write('##VCF1:'+reference_file+'\n')
+        total_complex_file.write('##VCF2:'+query_file+'\n')
+        total_complex_file.write('#CHROM\tPOS\tREF\tALT\tVCF1\tVCF2\n')
+        for c in detected_chr_list:
+            with open(chr_prefix_dict[c] + '.match') as complex_f:
+                for line in complex_f:
+                    # per-chromosome headers are dropped; the merged header is above
+                    if line.startswith('#'):
+                        continue
+                    total_complex_file.write(line)
+def varmatch_multi_vcf_multi_genome(multi_vcf_list, genome_list_file):
+    """
+    Find the variants common to several VCF files, chromosome by chromosome.
+    Every VCF is split by the chromosomes found in genome_list_file, each
+    chromosome is processed by varmatch_multi_vcf_single_genome(), and the
+    per-chromosome results are merged into output_dir/common.match.
+    Exits if a listed genome file or an input VCF file does not exist.
+    """
+    genome_dict = detect_multi_genome(genome_list_file, human_chromosome_list)
+    detected_chr_list = list(genome_dict.keys())
+    detected_chr_list.sort()
+    print ('\t[Multiple genome multiple vcf mode]')
+    print ('\tDetected genomes:')
+    chr_list_string = '\t'
+    for c in detected_chr_list:
+        chr_list_string += c + ','
+    chr_list_string = chr_list_string[:-1] + '\n'
+    print (chr_list_string)
+    for c in detected_chr_list:
+        if not os.path.isfile(genome_dict[c]):
+            print ('[VarMatch:Error:] Can not find genome file ' + genome_dict[c])
+            exit()
+    # ids are the positions of the files in multi_vcf_list
+    id_vcf_dict = {}
+    for vcfid, vcffile in enumerate(multi_vcf_list):
+        if not os.path.isfile(vcffile):
+            print ('[VarMatch:Error:] Can not find vcf file ' + vcffile)
+            exit()
+        id_vcf_dict[vcfid] = vcffile
+    vcfid_list = sorted(id_vcf_dict)
+    vcfid_chr_vcfsplit_dict = {}  # this is dict of dict, key is id, value is a dict with key as chr, value as vcf
+    for vcfid in vcfid_list:
+        vcfid_chr_vcfsplit_dict[vcfid] = split_multi_genome(id_vcf_dict[vcfid], detected_chr_list)
+    chr_prefix_dict = {}
+    for c in detected_chr_list:
+        # create id_singlechrvcf_dict
+        id_singlechrvcf_dict = {}
+        for vcfid in vcfid_chr_vcfsplit_dict:
+            id_singlechrvcf_dict[vcfid] = vcfid_chr_vcfsplit_dict[vcfid][c]
+        output_prefix = temp_dir + '/common.' + c
+        chr_prefix_dict[c] = output_prefix
+        varmatch_multi_vcf_single_genome(id_singlechrvcf_dict, genome_dict[c], output_prefix)
+    # merge multi chromosome
+    common_filename = output_dir + '/common.match'
+    with open(common_filename, 'w') as common_file:
+        # write vcf file names
+        for vcfid in vcfid_list:
+            common_file.write('##VCF' + str(vcfid + 1) + ':' + id_vcf_dict[vcfid] + '\n')
+        # write title
+        head_line = '#CHROM\tPOS\tREF\tALT'
+        for vcfid in vcfid_list:
+            head_line += '\tVCF' + str(vcfid + 1)
+        head_line += '\n'
+        common_file.write(head_line)
+        # follow the sorted chromosome list so the merged order is deterministic
+        for c in detected_chr_list:
+            with open(chr_prefix_dict[c] + '.match') as f:
+                for line in f:
+                    if line.startswith('#'):
+                        continue
+                    common_file.write(line)
+def varmatch_multi_vcf_single_genome(id_vcf_dict, genome_file, output_prefix):
+    """
+    id_vcf_dict key: id, value: vcf file
+    Match consecutive VCF files (in sorted id order) pairwise against one
+    genome file, then write the variants for which one entry per VCF was
+    collected to output_prefix + '.match'. Prints the number of such variants.
+    """
+    id_list = sorted(id_vcf_dict.keys())
+    id_finalname_dict = {}
+    for vcf_file_id in id_list:
+        vcf_file = id_vcf_dict[vcf_file_id]
+        if not os.path.isfile(vcf_file):
+            print('Error: Can not open vcf file ' + vcf_file)
+        if args.purify:
+            purify_file = temp_dir + '/' + os.path.basename(vcf_file) + '.purify.vcf'
+            purify(vcf_file, purify_file, genome_file)
+        else:
+            purify_file = vcf_file
+        id_finalname_dict[vcf_file_id] = purify_file
+    # match every file against its successor in id order
+    pairwise_prefix_list = []
+    for i in range(len(id_list) - 1):
+        ref_filename = id_finalname_dict[id_list[i]]
+        que_filename = id_finalname_dict[id_list[i + 1]]
+        pairwise_prefix_list.append(varmatch_pairwise(ref_filename, que_filename, genome_file, temp_dir))
+    # summarize
+    # [todo] summarize simple matches
+    # summarize complex matches
+    # [todo] summarize matching number
+    # variantid = columns 2-4 of a .match row joined by '@'
+    variantid_variant = {}
+    variantid_info = {}
+    for i, pairwise_prefix in enumerate(pairwise_prefix_list):
+        complex_match_file = pairwise_prefix + '.match'
+        if not os.path.isfile(complex_match_file):
+            print('Error: Can not open match result ' + complex_match_file)
+        with open(complex_match_file) as f:
+            for line in f:
+                if line.startswith('#'):
+                    continue
+                columns = line.strip().split('\t')
+                variantid = '@'.join(columns[1:4])
+                if i == 0:
+                    # the first pair seeds the table with its row
+                    variantid_variant[variantid] = columns[:4]
+                    variantid_info[variantid] = columns[4:]
+                elif variantid in variantid_info:
+                    # later pairs only extend variants that are already known
+                    variantid_info[variantid].append(columns[-1])
+    common_complex_num = 0
+    with open(output_prefix + '.match', 'w') as integrate_complex_file:
+        for i, vcf_id in enumerate(id_list):
+            # look the file up by its id rather than by its position
+            integrate_complex_file.write('##VCF' + str(i + 1) + ':' + id_vcf_dict[vcf_id] + '\n')
+        head_line = '#CHROM\tPOS\tREF\tALT'
+        for i in range(len(id_list)):
+            head_line += '\tVCF' + str(i + 1)
+        # end the header line so the first variant row is not glued onto it
+        head_line += '\n'
+        integrate_complex_file.write(head_line)
+        for variantid in sorted(variantid_info):
+            if len(variantid_info[variantid]) == len(id_list):
+                common_complex_num += 1
+                merge_list = variantid_variant[variantid] + variantid_info[variantid]
+                integrate_complex_file.write('\t'.join(merge_list) + '\n')
+    print(common_complex_num)
+def remove_duplicate(genome_filename, single_vcf_filename, output_prefix):
+    """
+    Placeholder for duplicate removal, which this version does not support.
+    Prints a notice and exits; the arguments are accepted but not used.
+    """
+    print ("current version does not support remove duplicate")
+    exit()
+def main():
+    """
+    Command-line entry point.
+    Prints the help text and exits when no arguments are given; otherwise
+    initializes the module-level settings, creates the output and temp
+    directories, and runs the mode selected by the parsed arguments.
+    """
+    if len(sys.argv) < 2:
+        parser.print_help()
+        exit()
+    # initialize global variables
+    global check_purify_command
+    global check_normalize_command
+    global check_compare_command
+    global script_path
+    global purify_tool
+    global compare_tool
+    global output_dir
+    global visual_dir
+    global temp_dir
+    check_purify_command = False
+    check_normalize_command = False
+    check_compare_command = True
+    script_path = sys.path[0]
+    purify_tool = script_path + '/purify'
+    compare_tool = script_path + '/vm'
+    # create output directory
+    if args.output is None or args.output == '':
+        output_dir = os.getcwd() + '/output'
+    else:
+        output_dir = args.output
+    temp_dir = output_dir + '/temp'
+    visual_dir = output_dir + '/visualization'
+    # makedirs also creates missing parent directories of a nested output path
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    if not os.path.exists(temp_dir):
+        os.makedirs(temp_dir)
+    # --remove_dup is commented out of the parser, so args may lack the
+    # attribute; treat a missing option as "not given" instead of crashing
+    remove_dup = getattr(args, 'remove_dup', None)
+    if remove_dup is not None and remove_dup != '':
+        if not os.path.isfile(remove_dup):
+            print ('\tError in remove duplication mode:\n')
+            print ('\tCan not find vcf file: ' + remove_dup)
+        basename = os.path.basename(remove_dup)
+        output_prefix = output_dir + '/' + basename + '.nodup'
+        remove_duplicate(args.g, remove_dup, output_prefix)
+    if args.multi_genome is not None and args.multi_genome != '':
+        if args.multi_vcf is not None:
+            # multi genome, multi vcf
+            varmatch_multi_vcf_multi_genome(args.multi_vcf, args.multi_genome)
+        elif remove_dup is not None:
+            # multi genome, single vcf(remove duplicates)
+            pass
+        else:
+            # pure multi genome, to compare two genome
+            varmatch_multi_genome(args.r, args.q, args.multi_genome)
+    elif args.multi_vcf is not None:
+        # multi vcf, single chromosome
+        output_prefix = output_dir + '/common'
+        id_vcf_dict = {}
+        for vcf_id, vcf_file in enumerate(args.multi_vcf):
+            id_vcf_dict[vcf_id] = vcf_file
+        varmatch_multi_vcf_single_genome(id_vcf_dict, args.g, output_prefix)
+    elif remove_dup is not None:
+        pass
+    else:
+        # single chromosome, pairwise compare
+        varmatch_pairwise(args.r, args.q, args.g, output_dir)
+if __name__ == '__main__':
+    main()
diff --git a/src/diploid.cpp b/src/diploid.cpp
new file mode 100644
index 0000000..9b5470b
--- /dev/null
+++ b/src/diploid.cpp
@@ -0,0 +1,3562 @@
+// code
+// author: Chen Sun, chensun at cse.psu.edu
+#include "diploid.h"
+// inline function protected: case-insensitive equality test for two sequences
+// code reviewed by Channing
+inline bool CompareSequence(string s1, string s2) { // arguments are copies, so the upper-casing below never touches the caller's strings
+	transform(s1.begin(), s1.end(), s1.begin(), ::toupper); // upper-case s1 in place
+	transform(s2.begin(), s2.end(), s2.begin(), ::toupper); // upper-case s2 in place
+	return s1 == s2; // true only when length and upper-cased content both agree
+inline bool PrefixMatch( std::string const& lhs, std::string const& rhs ) // true when the shorter of the two strings is a prefix of the other (case-sensitive)
+    return std::equal(
+        lhs.begin(),
+        lhs.begin() + std::min( lhs.size(), rhs.size() ), // compare only the first min(|lhs|, |rhs|) characters, so rhs is never over-read
+        rhs.begin() );
+DiploidVCF::DiploidVCF(int thread_num_):VCF(thread_num_) // forwards the requested thread count to the VCF base-class constructor
+    scoring_basepair = false; // CheckPrefix then scores 1 per selected variant instead of the reference-allele length
+	dout << "DiploidVCF() Thread Number: " << thread_num << endl;
+// private: loads the reference VCF into ref_variant_list, tagging every variant with flag 0
+int DiploidVCF::ReadRefVCF(string filename) {
+    return ReadDiploidVCF(filename, ref_variant_list, 0); // number of variants stored, or -1 when the file cannot be opened
+// private: loads the query VCF into que_variant_list, tagging every variant with flag 1
+int DiploidVCF::ReadQueryVCF(string filename) {
+    return ReadDiploidVCF(filename, que_variant_list, 1); // number of variants stored, or -1 when the file cannot be opened
+/* protected
+ * Trims and left-shifts the alleles of one diploid variant against
+ * genome_sequence, then writes the result back into var.pos, var.ref and
+ * var.alts. Returns false when no genome is loaded or when the first base of
+ * the reference allele disagrees with the genome at var.pos; returns true
+ * otherwise (a variant whose three alleles are all one base long is accepted
+ * unchanged).
+ */
+// [todo] unit test normalization
+// normalization modifies vt normalize algorithm
+// code reviewed by Channing 4/2/2016
+bool DiploidVCF::NormalizeDiploidVariant(DiploidVariant & var) {
+	int pos = var.pos;
+	string parsimonious_ref = var.ref;
+	string parsimonious_alt0 = var.alts[0];
+	string parsimonious_alt1 = var.alts[0]; // second allele defaults to a copy of the first alternate
+	if (var.heterozygous && var.multi_alts)
+		parsimonious_alt1 = var.alts[1]; // a real second alternate is used only for heterozygous multi-allelic variants
+	int left_index = pos;
+	if (genome_sequence.size() == 0) return false;
+	if (parsimonious_ref.size() == 1 && parsimonious_alt0.size() == 1 && parsimonious_alt1.size() == 1) return true; // all alleles are single bases: nothing to trim or shift
+	if (toupper(genome_sequence[left_index]) != toupper(parsimonious_ref[0])) { // NOTE(review): left_index is not range-checked against genome_sequence before this read
+		dout << "[Error] genome sequence, subsequence, offset does not match." << endl;
+		return false;
+	}
+	bool change_in_allels = true;
+	while (change_in_allels) { // right-trim pass: repeat while a shared trailing base was removed
+		change_in_allels = false;
+		if (toupper(parsimonious_ref.back()) == toupper(parsimonious_alt0.back()) && toupper(parsimonious_ref.back()) == toupper(parsimonious_alt1.back())) {
+			if ((parsimonious_ref.size() > 1 && parsimonious_alt0.size() > 1 && parsimonious_alt1.size() > 1) || left_index > 0) { // when left_index == 0, can not make further changes
+				parsimonious_ref.pop_back();
+				parsimonious_alt0.pop_back();
+				parsimonious_alt1.pop_back();
+				change_in_allels = true;
+			}
+            // else do not make further changes
+		}
+		if (parsimonious_ref.length() == 0 || parsimonious_alt0.length() == 0 || parsimonious_alt1.length() == 0) { // an allele became empty: extend every allele one genome base to the left
+			left_index--;
+			char left_char = toupper(genome_sequence[left_index]);
+			parsimonious_ref = left_char + parsimonious_ref;
+			parsimonious_alt0 = left_char + parsimonious_alt0;
+			parsimonious_alt1 = left_char + parsimonious_alt1;
+		}
+	}
+	while (toupper(parsimonious_ref[0]) == toupper(parsimonious_alt0[0]) && // left-trim pass: drop shared leading bases while every allele keeps at least one base
+            toupper(parsimonious_ref[0]) == toupper(parsimonious_alt1[0]) &&
+            parsimonious_ref.size() > 1 &&
+            parsimonious_alt0.size() > 1 &&
+            parsimonious_alt1.size() > 1)
+    {
+		parsimonious_ref.erase(0, 1);
+		parsimonious_alt0.erase(0, 1);
+		parsimonious_alt1.erase(0, 1);
+        left_index ++; // left_index indicates variant position, if truncate the leftmost, then
+	}
+	var.pos = left_index;
+	var.ref = parsimonious_ref;
+	var.alts[0] = parsimonious_alt0;
+	if (var.heterozygous && var.multi_alts) // alts[1] is written back only in the case where it was read above
+		var.alts[1] = parsimonious_alt1;
+	return true;
+// Trims and left-shifts one reference allele and two alternate alleles against
+// genome_sequence. The three strings are modified in place.
+//   pos               index into genome_sequence where the alleles start
+//   parsimonious_ref  reference allele (in/out)
+//   parsimonious_alt0 first alternate allele (in/out)
+//   parsimonious_alt1 second alternate allele (in/out)
+// Returns -1 when no genome is loaded, otherwise the index into
+// genome_sequence at which the normalized alleles start.
+int DiploidVCF::NormalizeVariantSequence(int pos, string & parsimonious_ref, string & parsimonious_alt0, string & parsimonious_alt1) {
+	int left_index = pos;
+	if (genome_sequence.size() == 0) return -1;
+	// All alleles are single bases: nothing to trim, so the position is unchanged.
+	// (This used to `return true`, i.e. position 1, regardless of pos.)
+	if (parsimonious_ref.size() == 1 && parsimonious_alt0.size() == 1 && parsimonious_alt1.size() == 1) return left_index;
+	// Right-trim pass: repeat while a shared trailing base was removed.
+	bool change_in_allels = true;
+	while (change_in_allels) {
+		change_in_allels = false;
+		if (parsimonious_ref.back() == parsimonious_alt0.back() && parsimonious_ref.back() == parsimonious_alt1.back() ) {
+			if ((parsimonious_ref.size() > 1 && parsimonious_alt0.size() > 1 && parsimonious_alt1.size() > 1) || left_index > 0) { // when left_index == 0, can not make further changes
+				parsimonious_ref.pop_back();
+				parsimonious_alt0.pop_back();
+				parsimonious_alt1.pop_back();
+				change_in_allels = true;
+			}
+			// else do not make further changes
+		}
+		// An allele became empty: extend every allele one genome base to the left.
+		if (parsimonious_ref.length() == 0 || parsimonious_alt0.length() == 0 || parsimonious_alt1.length() == 0) {
+			left_index--;
+			char left_char = toupper(genome_sequence[left_index]);
+			parsimonious_ref = left_char + parsimonious_ref;
+			parsimonious_alt0 = left_char + parsimonious_alt0;
+			parsimonious_alt1 = left_char + parsimonious_alt1;
+		}
+	}
+	// Left-trim pass: drop shared leading bases while every allele keeps at
+	// least one base; each removed base moves the variant one position right.
+	while (parsimonious_ref[0] == parsimonious_alt0[0] &&
+            parsimonious_ref[0] == parsimonious_alt1[0] &&
+            parsimonious_ref.size() > 1 &&
+            parsimonious_alt0.size() > 1 &&
+            parsimonious_alt1.size() > 1)
+    {
+		parsimonious_ref.erase(0, 1);
+		parsimonious_alt0.erase(0, 1);
+		parsimonious_alt1.erase(0, 1);
+        left_index ++;
+	}
+	return left_index;
+}
+// Loads a FASTA file into genome_sequence by concatenating every sequence
+// line; header lines (starting with '>') are skipped, so the sequences of all
+// records end up in one string. On open failure a message is printed and
+// genome_sequence is left untouched.
+void DiploidVCF::ReadGenome(string filename) {
+	ifstream genome_file;
+	genome_file.open(filename.c_str());
+	if (!genome_file.good()) {
+		cout << "[VarMatch] can not open FASTA file: ";
+		cout << filename << endl;
+		return;
+	}
+	genome_sequence = "";
+	string line;
+	// Loop on getline itself so a failed read is never processed as a line.
+	while (getline(genome_file, line, '\n')) {
+		// Strip the carriage return left by CRLF line endings so it does not
+		// become part of the genome and shift coordinates.
+		if (!line.empty() && line.back() == '\r') line.pop_back();
+		// Skip blank lines. A line holding a single base is real sequence
+		// (typically the last line of a record) and must be kept.
+		if (line.find_first_not_of(" \t") == std::string::npos) continue;
+		if (line[0] == '>') continue;
+		genome_sequence += line;
+	}
+	genome_file.close();
+	return;
+}
+/* protected
+ * Parses a tab-separated VCF file and appends one DiploidVariant per accepted
+ * record to x_variant_list.
+ *   filename        path of the VCF file
+ *   x_variant_list  output list the variants are appended to
+ *   flag            stored in every variant (callers here pass 0 for the
+ *                   reference set and 1 for the query set)
+ * Returns the number of variants appended, or -1 when the file cannot be
+ * opened. Side effects: may switch match_genotype off, and sets
+ * chromosome_name from the first record while it is still ".".
+ */
+// code reviewed by Channing and Succulent on 4/2/2016
+int DiploidVCF::ReadDiploidVCF(string filename, vector<DiploidVariant> & x_variant_list, int flag) {
+    // read and change all sequence to upper case
+    int total_num = 0;
+	ifstream vcf_file;
+	vcf_file.open(filename.c_str());
+	if (!vcf_file.good()) {
+		cout << "[VarMatch] Error: can not open vcf file" << endl;
+		return -1;
+	}
+	int genotype_index = -1; // index of "GT" inside the FORMAT column; found once and reused for later records
+	char genotype_separator = '/'; // switched to '|' the first time '/' does not yield two alleles
+	//int genome_sequence_length = genome_sequence.length();
+	while (!vcf_file.eof()) { // alternative way is vcf_file != NULL
+		string line;
+		getline(vcf_file, line, '\n');
+		// check ineligible lines
+		//dout << line << endl;
+		if ((int)line.length() <= 1) continue; // also skips the empty line produced by the final failed getline
+		//if (line.find_first_not_of(' ') == std::string::npos) continue;
+		if (line[0] == '#') { // header and meta lines carry no variant
+//			if (line[1] == '#') continue;
+//			auto head_names = split(line, '\t');
+//			if (head_names.size() < 10 && match_genotype) {
+//				cout << "[VarMatch] Warning: not enough information in VCF file for genotype matching." << endl;
+//				cout << "[VarMatch] \tVCF file name " << filename << endl;
+//				cout << "[VarMatch] \tAutomatically turn off genotype matching module." << endl;
+//				match_genotype = false;
+//			}
+			continue;
+		}
+		auto columns = split(line, '\t');
+		if (columns.size() < 10) { // fewer than 10 columns: no sample column to read a genotype from
+			if(match_genotype){
+                cout << "[VarMatch] Warning: not enough information in VCF file for genotype matching." << endl;
+                cout << "[VarMatch] \tAutomatically turn off genotype matching module " << filename << endl;
+                match_genotype = false;
+                continue; // NOTE(review): this also drops the record that triggered the warning - confirm that is intended
+            }
+            if(columns.size() < 6){
+                cout << "[VarMatch] Warning: not enough information in VCF file for variant matching." << endl;
+                cout << "[VarMatch] skip current variant: " << line << endl;
+                continue;
+            }
+		}
+		if (chromosome_name == ".") chromosome_name = columns[0];
+		auto pos = atoi(columns[1].c_str()) - 1; // 0-based coordinate
+//        if(pos == 79240316){
+//            cout << "find snp from: " << flag << endl;
+//        }
+		auto ref = columns[3];
+		if(ref.size() >= VAR_LEN) continue; // alleles of VAR_LEN or more characters are skipped
+		auto alt_line = columns[4];
+		auto quality = columns[5]; // read but not used further in this function
+		ToUpper(ref);
+		ToUpper(alt_line);
+		bool is_heterozygous_variant = false;
+		bool is_multi_alternatives = false;
+		if (columns.size() >= 10) {
+			if (genotype_index < 0) {
+                auto formats = split(columns[8], ':');
+                for (int i = 0; i < formats.size(); i++) {
+                    if (formats[i] == "GT") {
+                        genotype_index = i;
+                        break;
+                    }
+                }
+                if(genotype_index < 0){
+                    cout << "[VarMatch] VCF entry does not contain genotype information" << endl;
+                    continue;
+                }
+			}
+			auto additionals = split(columns[9], ':'); // only the first sample column is examined
+            vector<string> genotype_columns = split(additionals[genotype_index], genotype_separator); // NOTE(review): genotype_index is not checked against additionals.size()
+            if(genotype_columns.size() != 2){
+                genotype_separator = '|';
+                genotype_columns = split(additionals[genotype_index], genotype_separator);
+            }
+			// normalize format of genotype: sorted, separated by |
+			if (genotype_columns.size() != 2) {
+				cout << "[VarMatch] Warning Unrecognized Genotype: " << additionals[genotype_index] << endl;
+				continue;
+			}
+			else {
+				if (genotype_columns[0] != genotype_columns[1]) {
+					is_heterozygous_variant = true;
+				}
+			}
+            if (genotype_columns[1] == "0" && genotype_columns[0] == "0" && match_genotype) { // genotype 0/0 carries no alternate allele
+                continue;
+            }
+		}
+		vector<string> alt_list;
+		if (alt_line.find(",") != std::string::npos) {
+			alt_list = split(alt_line, ',');
+			if(alt_list[0].size() >= VAR_LEN || alt_list[1].size() >= VAR_LEN) continue; // NOTE(review): assumes split returns at least two entries; alternates beyond the second are not length-checked
+			is_multi_alternatives = true;
+		}
+		else {
+            if(alt_line.size() >= VAR_LEN) continue;
+			alt_list.push_back(alt_line);
+		}
+        int snp_ins = max(0, (int)alt_list[0].length() - (int)ref.length()); // largest number of bases an alternate adds over ref
+        int snp_del = max(0, (int)ref.length() - (int)alt_list[0].length()); // largest number of bases an alternate removes from ref
+        if(is_multi_alternatives){
+            snp_ins = max(snp_ins, (int)alt_list[1].length() - (int)ref.length());
+            snp_del = max(snp_del, (int)ref.length() - (int)alt_list[1].length());
+        }
+		DiploidVariant dv(pos, ref, alt_list, is_heterozygous_variant, is_multi_alternatives, snp_del, snp_ins, flag);
+		if (normalization) {
+			NormalizeDiploidVariant(dv); // return value ignored: the variant is stored even when normalization fails
+		}
+        x_variant_list.push_back(dv);
+        total_num++;
+	}
+	vcf_file.close();
+	return total_num;
+/* protected override
+ * Divides the genome into thread_num ranges of equal width by appending cut
+ * positions to pos_boundries (the last one is the genome length), and appends
+ * one empty position-to-variant map per thread to refpos_2_var and
+ * querypos_2_var. Sets boundries_decided when done.
+ * NOTE(review): the containers are appended to, not cleared, so a second call
+ * would add a second set of entries; thread_num == 0 would divide by zero.
+ */
+// code reviewed by Channing and Succulent on 4/2/2016
+void DiploidVCF::DecideBoundaries() {
+	int genome_size = genome_sequence.size();
+    if(genome_size == 0){
+        dout << "[VarMatch] Warning: no genome sequence detected when decide boundries. " << endl;
+    }
+	int distance = genome_size / thread_num; // integer division; any remainder falls into the last range
+	for (int i = 0; i < thread_num - 1; i++) {
+		pos_boundries.push_back((i + 1)*distance);
+	}
+	pos_boundries.push_back(genome_size); // final boundary is the genome end
+	for (int i = 0; i < thread_num; i++) {
+		refpos_2_var.push_back(unordered_map<int, DiploidVariant>());
+		querypos_2_var.push_back(unordered_map<int, DiploidVariant>());
+	}
+	boundries_decided = true;
+void DiploidVCF::DirectSearchInThread(unordered_map<int, DiploidVariant> & ref_snps, // removes variants present identically in both maps and records one text line per match in direct_match_records[thread_index]
+                                      unordered_map<int, DiploidVariant> & query_snps,
+                                      int thread_index) {
+	// handle heterozygous variants
+	auto rit = ref_snps.begin();
+	auto rend = ref_snps.end();
+	for (; rit != rend;) { // no increment here: the iterator is advanced by erase() or by ++rit below
+		auto r_pos = rit->first;
+		DiploidVariant r_var = rit->second;
+		auto qit = query_snps.find(r_pos); // candidate is the query variant stored under the same position key
+		if (qit != query_snps.end()) {
+			DiploidVariant q_var = qit->second;
+			if (r_var == q_var) {
+				string matching_result = chromosome_name + '\t' + to_string(r_var.pos + 1) + "\t" + r_var.ref + "\t"; // position is written as pos + 1
+				auto alt_string = r_var.alts[0];
+				if (r_var.multi_alts)
+					alt_string += "," + r_var.alts[1];
+				matching_result += alt_string;
+				direct_match_records[thread_index]->push_back(matching_result);
+				rit = ref_snps.erase(rit); // erase returns the next valid iterator
+				query_snps.erase(qit);
+			}
+			else {
+				++rit;
+			}
+		}
+		else {
+			++rit;
+		}
+	}
+/* directly match by position
+ * Runs DirectSearchInThread over every per-thread pair of maps in
+ * refpos_2_var / querypos_2_var (thread_num - 1 worker threads plus the
+ * calling thread), then writes all collected match lines to
+ * output_simple_filename and frees the per-thread record vectors.
+ */
+// private
+void DiploidVCF::DirectSearchMultiThread() {
+	direct_match_records = new vector<string>*[thread_num]; // one result vector per thread, so workers never write to the same vector
+	for (int j = 0; j < thread_num; j++) {
+		direct_match_records[j] = new vector<string>;
+	}
+	vector<thread> threads;
+	//spawn threads
+	unsigned i = 0;
+	for (; i < thread_num - 1; i++) { // NOTE(review): unsigned i is compared with thread_num - 1; presumably thread_num >= 1 - confirm
+		threads.push_back(thread(&DiploidVCF::DirectSearchInThread, this, ref(refpos_2_var[i]), ref(querypos_2_var[i]), i));
+	}
+	// also you need to do a job in main thread
+	// i equals to (thread_num - 1)
+	if (i != thread_num - 1) {
+		dout << "[Error] thread number not match" << endl;
+	}
+	DirectSearchInThread(refpos_2_var[i], querypos_2_var[i], i); // last partition is processed on the calling thread
+	// call join() on each thread in turn before this function?
+	std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
+	threads.clear();
+	ofstream output_simple_file;
+	output_simple_file.open(output_simple_filename);
+	output_simple_file << "##VCF1:" << ref_vcf_filename << endl;
+	output_simple_file << "##VCF2:" << que_vcf_filename << endl;
+	output_simple_file << "#CHROM\tPOS\tREF\tALT" << endl;
+	for (int i = 0; i < thread_num; i++) { // records are written grouped by thread index
+		for (int j = 0; j < direct_match_records[i]->size(); j++) {
+			output_simple_file << direct_match_records[i]->at(j) << endl;
+		}
+	}
+	output_simple_file.close();
+	for (int j = 0; j < thread_num; j++) {
+		delete direct_match_records[j];
+	}
+	delete[] direct_match_records; // NOTE(review): the member pointer is left dangling after this delete
+// Searches one cluster of variants (reference and query mixed, told apart by
+// DiploidVariant::flag) for the best-scoring selection under which both sets
+// produce the same pair of donor sequences, using the exhaustive search in
+// RecurrentMatchWithIndel.
+//   variant_list  the cluster; sorted in place
+//   thread_index  selects the per-thread counters and record vector
+// Returns true when a match with a positive score was found. In that case the
+// match record is printed to cout, appended to
+// complex_match_records[thread_index], and the per-thread matched-variant
+// counters are increased.
+bool DiploidVCF::RecurrentVariantMatch(vector<DiploidVariant> & variant_list, int thread_index) {
+	sort(variant_list.begin(), variant_list.end());
+	map<int, DiploidVariant> separate_pos_var[2];
+	// Must start as false: the flags are only ever set to true below and are
+	// read afterwards, so leaving them uninitialized is undefined behaviour.
+	bool separate_contians_indel[2] = { false, false };
+	// separate into ref and que
+	int min_pos = genome_sequence.length() + 1;
+	int max_pos = -1;
+	for (int i = 0; i < variant_list.size(); i++) {
+		int flag = variant_list[i].flag; // flag indicate if the variant is from ref set or query set
+		int pos = variant_list[i].pos;
+		separate_pos_var[flag][pos] = variant_list[i];
+		auto ref_sequence = variant_list[i].ref;
+		auto alt_sequences = variant_list[i].alts;
+		min_pos = min(pos, min_pos);
+		max_pos = max((int)(pos + ref_sequence.length()), max_pos);
+		if (ref_sequence.length() != alt_sequences[0].length())
+			separate_contians_indel[flag] = true;
+		if (variant_list[i].multi_alts) {
+			if (ref_sequence.length() != alt_sequences[1].length()) {
+				separate_contians_indel[flag] = true;
+			}
+		}
+	}
+	// Widen the window by one base on each side, clamped to the genome.
+	min_pos = max(min_pos - 1, 0);
+	max_pos = min(max_pos + 1, (int)genome_sequence.length());
+	if (!separate_contians_indel[0] && !separate_contians_indel[1]) {
+		// There is no way that there will be a match
+		return false;
+	}
+	if (separate_pos_var[0].size() == 0 || separate_pos_var[1].size() == 0) {
+		return false;
+	}
+	string subsequence = genome_sequence.substr(min_pos, max_pos-min_pos);
+	int offset = min_pos;
+	// 0 for ref, 1 for query, same as flag
+	// choices[set*2 + haplotype][pos]: -1 undecided, 0 ref, 1 alts[0], 2 alts[1]
+    map<int, int> choices[4];
+	for(int i = 0; i < 2; i++){
+        for(int j = 0; j < 2; j++){
+            for(auto it = separate_pos_var[i].begin(); it != separate_pos_var[i].end(); ++it){
+                auto pos = it->first;
+                choices[i*2+j][pos] = -1;
+            }
+        }
+    }
+	map<int, int> max_matches[4];
+	string max_paths[2];
+	int max_score = 0;
+	RecurrentMatchWithIndel(variant_list,
+		subsequence,
+		offset,
+		0,
+		separate_pos_var,
+		choices,
+		max_matches,
+		max_score,
+		max_paths);
+	if (max_score == 0) {
+		return false;
+	}
+	// matched, print out matches
+	// Two haplotypes are reported only when the second path differs from both
+	// the reference window and the first path.
+	bool multiple_match = true;
+	if (CompareSequence(max_paths[1], subsequence) || CompareSequence(max_paths[1], max_paths[0])) {
+		multiple_match = false;
+	}
+	string alt_record = max_paths[0];
+	if (multiple_match)
+		alt_record += "/" + max_paths[1];
+	string match_record = chromosome_name + "\t" + to_string(offset) + "\t" + subsequence + "\t" + alt_record;
+	string vcf_record[2] = { "" };
+	string phase_record[4] = { "" };
+	map<int, bool> separate_pos_matched[2];
+	for (int i = 0; i < 2; i++) {
+		for (auto it = separate_pos_var[i].begin(); it != separate_pos_var[i].end(); ++it) {
+			separate_pos_matched[i][it->first] = false;
+		}
+	}
+	// A variant counts as matched when either haplotype of its set chose one
+	// of its alternate alleles.
+	for(int i = 0; i < 2; i++){
+		// i= 0 ref, =1 alt
+	    for(int j = 0; j < 2; j++){
+	        auto c = max_matches[i*2+j];
+	        for(auto it = c.begin(); it !=c.end(); ++it){
+				if (it->second > 0) {
+					separate_pos_matched[i][it->first] = true;
+				}
+	        }
+	    }
+	}
+	for (auto it = separate_pos_matched[0].begin(); it != separate_pos_matched[0].end(); ++it) {
+		if (it->second) {
+			complex_ref_match_num[thread_index] ++;
+		}
+	}
+	for (auto it = separate_pos_matched[1].begin(); it != separate_pos_matched[1].end(); ++it) {
+		if (it->second) {
+			complex_que_match_num[thread_index] ++;
+		}
+	}
+	for (int i = 0; i < 2; i++) {
+		auto final_iter = separate_pos_matched[i].end();
+		--final_iter;
+		for (auto it = separate_pos_matched[i].begin(); it != separate_pos_matched[i].end(); ++it) {
+			if (it->second) {
+				int pos = it->first;
+				DiploidVariant variant = separate_pos_var[i][pos];
+				// Second allele of the variant: alts[1] when multi-allelic,
+				// the reference when heterozygous, otherwise (homozygous)
+				// alts[0] again. The heterozygous test used to be negated,
+				// which disagreed with RecurrentMatchWithIndel and
+				// VariantMatch.
+				string alt1_string = variant.alts[0];
+				if (variant.multi_alts) {
+					alt1_string = variant.alts[1];
+				}
+				else if (variant.heterozygous) {
+					alt1_string = variant.ref;
+				}
+				string variant_record = to_string(pos) + "," + variant.ref + "," + variant.alts[0];
+				if (multiple_match)
+					variant_record += "/" + alt1_string;
+				vcf_record[i] += variant_record;
+				//cout << pos << ":" << max_matches[i*2+1][pos] << endl;
+				phase_record[i * 2] += to_string(max_matches[i * 2][pos]);
+				phase_record[i * 2 + 1] += to_string(max_matches[i * 2 + 1][pos]);
+				if (it != final_iter) {
+					vcf_record[i] += ";";
+					phase_record[i * 2] += ",";
+					phase_record[i * 2 + 1] += ",";
+				}
+			}
+		}
+	}
+	match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+	if (multiple_match) {
+		match_record += "\t" + phase_record[0] + "/" + phase_record[1] + "\t" + phase_record[2] + "/" + phase_record[3];
+	}
+	else {
+		match_record += "\t.\t.";
+	}
+	match_record += "\t" + to_string(max_score) + "\n";
+	cout << match_record ;
+	// Dump every variant of both sets to cout after the record.
+	for (int i = 0; i < 2; i++)
+	{
+		if (i == 0) {
+			cout << "ref: ";
+		}
+		else {
+			cout << "alt: ";
+		}
+		cout << separate_pos_var[i].size() << endl;
+		for (auto it = separate_pos_var[i].begin(); it != separate_pos_var[i].end(); ++it) {
+			auto v = it->second;
+			cout << v.pos << "," << v.ref << "," << v.alts[0];
+			if (v.multi_alts) {
+				cout << v.alts[1];
+			}
+			cout << ";";
+		}
+		cout << endl;
+	}
+	cout << endl;
+	complex_match_records[thread_index]->push_back(match_record);
+    return true;
+}
+void DiploidVCF::RecurrentMatchWithIndel(vector<DiploidVariant> & variant_list, // exhaustive recursion: tries every allele choice for variant_list[index..] and keeps the best-scoring full match in max_matches / max_score / max_paths
+	const string subsequence,
+	const int offset,
+	int index,
+	map<int, DiploidVariant> separate_pos_var [],
+	map<int, int> choices [], // 4 vectors
+	map<int, int> max_matches[],  // 4 vectors
+	int & max_score,
+	string max_paths[]) {
+	string cur_paths[2];
+    int prefix_match = CheckPrefix(subsequence, offset, separate_pos_var, choices, cur_paths);
+    if (prefix_match < 0) return; // the decided part already disagrees: prune this branch
+	// if prefix_match == 0, just prefix match
+	if (prefix_match > 0) { // sequence direct match
+		int score = prefix_match;
+		if (max_score < score) { // strictly greater: the first selection reaching a score is kept
+			//cout << "higher score: " << score << endl;
+            max_score = score;
+			for (int i = 0; i < 4; i++) {
+				max_matches[i] = choices[i];
+			}
+			for (int i = 0; i < 2; i++) {
+				max_paths[i] = cur_paths[i];
+			}
+		}
+	}
+	if (index >= variant_list.size()) return; // every variant has been decided
+	auto variant = variant_list[index];
+	int flag = variant.flag;
+	int pos = variant.pos;
+	int choice_end = 1;
+	if (variant.multi_alts) choice_end = 2;
+	for (int choice = 0; choice <= choice_end; choice++) { // 0 = keep reference, 1 = alts[0], 2 = alts[1] on the first haplotype
+		if(pos == 0){
+            dout << "error pos = 0 " << endl;
+        }
+        choices[flag * 2][pos] = choice;
+		if (choice == 0) {
+			choices[flag * 2 + 1][pos] = 0; // variant skipped on both haplotypes
+		}
+		else if (choice == 1) { // include
+			if (variant.multi_alts) { // if multi_alts, then the other alleles should be included
+				choices[flag * 2 + 1][pos] = 2;
+			}
+			else if (variant.heterozygous) { // if heterozygous but not multi_alts, then reference should be included
+				choices[flag * 2 + 1][pos] = 0;
+			}
+			else { // homozygous one
+				choices[flag * 2 + 1][pos] = 1;
+			}
+		}
+		else {
+			choices[flag * 2 + 1][pos] = 1; // first haplotype took alts[1], so the second takes alts[0]
+		}
+		RecurrentMatchWithIndel(variant_list,
+			subsequence,
+			offset,
+			index + 1,
+			separate_pos_var,
+			choices,
+			max_matches,
+			max_score,
+			max_paths);
+		choices[flag * 2][pos] = -1; // restore "undecided" before trying the next choice
+		choices[flag * 2 + 1][pos] = -1;
+	}
+/* check if prefix match or equal
+ * Builds the four donor sequences (two haplotypes for the reference set, two
+ * for the query set) implied by `choices` and compares them between the sets.
+ * Returns a positive score when both haplotype pairs are identical under one
+ * of the two pairings (cur_paths then receives the two reference-set paths),
+ * 0 when they only agree on their common prefix, and -1 when they disagree.
+ */
+int DiploidVCF::CheckPrefix(const string subsequence,
+	const int offset,
+	map<int, DiploidVariant> separate_pos_var[],
+	map<int, int> choices[],
+	string cur_paths[])
+	string paths[4] = { "" }; // 0 and 1 are ref, 2 and 3 are query path
+	// create 4 paths
+	for (int i = 0; i < 2; i++) {
+		// create
+		for (int j = 0; j < 2; j++) {
+			int index = i*2 + j;
+			map<int, int> pos_choice = choices[index];
+			string path = "";
+			int start_pos = 0; // first position of subsequence not yet copied into path
+			auto it = pos_choice.begin();
+            for (; it != pos_choice.end(); ++it) {
+				int pos = it->first;
+				int choice = it->second;
+				auto variant = separate_pos_var[i][pos];
+				string ref = variant.ref;
+				auto alts = variant.alts;
+				int offset_pos = pos - offset; // variant position relative to the start of subsequence
+				if (offset_pos < start_pos) { // variant starts inside the previous one; the rejection below is disabled
+					//return -1;
+				}
+				else if (offset_pos > start_pos) {
+					path += subsequence.substr(start_pos, offset_pos - start_pos); // copy the unchanged bases before the variant
+				}
+				if(choice < 0) // undecided variant: the path ends here and is only usable as a prefix
+                    break;
+				if (choice == 0) {
+					path += ref;
+				}
+				else if (choice == 1) {
+					path += alts[0];
+				}
+				else {
+					path += alts[1];
+				}
+                start_pos = max(start_pos, offset_pos + (int)ref.length());
+			}
+            if(it == pos_choice.end()){ // every variant was decided: append the remaining reference bases
+                if(start_pos < subsequence.length()){
+                    path += subsequence.substr(start_pos, subsequence.length()-start_pos);
+                }
+            }
+			paths[index] = path;
+		}
+	}
+	// check prefix match
+	int const comb[2][4] = { // 1-based path numbers: pairing {1 vs 3, 2 vs 4} or {1 vs 4, 2 vs 3}
+		{1,3,2,4},
+		{1,4,2,3}
+	};
+	bool prefix_match = false;
+	bool direct_match = false;
+	for (int i = 0; i < 2; i++) {
+		bool check_prefix_match[2] = { false };
+		bool check_direct_match[2] = { false };
+		for (int k = 0; k < 2; k++) {
+			string s1 = paths[comb[i][k * 2]-1];
+			string s2 = paths[comb[i][k * 2 + 1]-1];
+            int min_len = min(s1.length(), s2.length());
+			string s1_sub = s1.substr(0, min_len);
+			string s2_sub = s2.substr(0, min_len);
+			check_prefix_match[k] = CompareSequence(s1_sub, s2_sub); // agreement on the common-length prefix
+			check_direct_match[k] = CompareSequence(s1, s2); // full equality
+		}
+		if (check_prefix_match[0] && check_prefix_match[1])
+			prefix_match = true;
+		if (check_direct_match[0] && check_direct_match[1])
+			direct_match = true;
+	}
+	if (direct_match) {
+        for(int i = 0; i < 4; i++){
+            dout << paths[i] << endl;
+        }
+        dout << endl;
+		int score = 0;
+		for (int i = 0; i < 2; i++) {
+			cur_paths[i] = paths[i]; // only paths 0 and 1 (the reference-set haplotypes) are returned
+			auto pos_var = separate_pos_var[i];
+			for (auto it = pos_var.begin(); it != pos_var.end(); ++it) {
+				if(choices[i*2][it->first] <= 0 && choices[i*2+1][it->first] <= 0){ // variant unused on both haplotypes: no score
+                    continue;
+                }
+                if (scoring_basepair) {
+					score += it->second.ref.length();
+				}
+				else {
+					score += 1;
+				}
+			}
+		}
+		return score; // NOTE(review): this is 0 when no variant was selected, which callers cannot tell apart from a prefix-only match
+	}
+	if (prefix_match) return 0;
+    return -1;
+/* code reviewed by Channing 4/3/2016
+ * Enumerates every way of picking k of the given positions, in input order.
+ * Each pick is a {position, allele} pair with allele 0, and additionally
+ * allele 1 for positions whose multi_indicators entry is true. Returns an
+ * empty list when k is 0 or larger than the number of positions.
+ */
+vector<vector<vector<int>>> DiploidVCF::Combine(vector<int> & positions, vector<bool> & multi_indicators, int k) {
+	vector<vector<int>> sol; // selection under construction, shared by the whole recursion
+	vector<vector<vector<int>>> all_sol;
+	if (k == 0 || k > positions.size()) {
+		return all_sol;
+	}
+	FindComb(positions,
+		multi_indicators,
+		0,
+		k,
+		sol,
+		all_sol);
+	return all_sol;
+/* code review by Channing 4/3/2016
+ * Recursive helper of Combine: extends `sol` with k more {position, allele}
+ * pairs taken from positions[start..] and appends every completed selection
+ * to all_sol. `sol` is restored to its incoming contents before returning.
+ */
+// [TODO] unit test
+void DiploidVCF::FindComb(vector<int> & positions,
+	vector<bool> & multi_indicators,
+	int start,
+	int k,
+	vector<vector<int> > & sol,
+	vector<vector<vector<int>>> & all_sol)
+	if (k == 0) { // selection complete: store a copy
+		all_sol.push_back(sol);
+		return;
+	}
+	int n = positions.size();
+	for (int i = start; i <= n - k; i++) { // stop early enough that k - 1 positions remain after i
+		sol.push_back(vector<int>({ positions[i], 0 }));
+		FindComb(positions, multi_indicators, i + 1, k - 1, sol, all_sol);
+		sol.pop_back();
+		if (multi_indicators[i]) { // try second allele
+			sol.push_back(vector<int>({ positions[i], 1 }));
+			FindComb(positions, multi_indicators, i + 1, k - 1, sol, all_sol);
+			sol.pop_back();
+		}
+	}
+/* code reviewed by Chen on 4/4/2016
+ * Searches one cluster of variants (reference and query mixed, told apart by
+ * DiploidVariant::flag) for the best match via FindBestDiploidMatch. The
+ * cluster is sorted in place. Returns false when the cluster has at most one
+ * variant, when either set is empty, or when the best score is 0. Otherwise
+ * appends a tab-separated match record to complex_match_records[thread_index],
+ * increases the per-thread matched-variant counters and returns true.
+ */
+bool DiploidVCF::VariantMatch(vector<DiploidVariant> & variant_list, int thread_index) {
+    if(variant_list.size() <= 1) return false;
+	sort(variant_list.begin(), variant_list.end());
+	map<int, DiploidVariant> separate_pos_var[2];
+	// separate into ref and que
+	int min_pos = genome_sequence.length() + 1;
+	int max_pos = -1;
+	for (int i = 0; i < variant_list.size(); i++) {
+		int flag = variant_list[i].flag; // flag indicate if the variant is from ref set(0) or query set(1)
+		int pos = variant_list[i].pos;
+		separate_pos_var[flag][pos] = variant_list[i];
+		auto ref_sequence = variant_list[i].ref;
+		auto alt_sequences = variant_list[i].alts;
+		min_pos = min(pos, min_pos);
+		max_pos = max((int)(pos + ref_sequence.length()), max_pos);
+        //dout << pos << "," << ref_sequence << "," << alt_sequences[0] << "," << flag << endl;
+	}
+	min_pos = max(min_pos - 1, 0); // widen the window by one base on each side, clamped to the genome
+	max_pos = min(max_pos + 1, (int)genome_sequence.length());
+	if (separate_pos_var[0].size() == 0 || separate_pos_var[1].size() == 0) {
+		return false;
+	}
+	string subsequence = genome_sequence.substr(min_pos, max_pos - min_pos);
+	int offset = min_pos;
+	vector<vector<int>> max_choices[4]; // -1 for ref, 0 for alts[0], 1 for alts[1] (only applied to multi_alts)
+	string max_paths[2];
+	int max_score = 0;
+	bool max_heterozygosity = false;
+    FindBestDiploidMatch(variant_list,
+                  subsequence,
+                  offset,
+                  0,
+                  separate_pos_var,
+                  max_choices,
+                  max_score,
+                  max_heterozygosity,
+                  max_paths);
+	if (max_score == 0) {
+		return false;
+	}
+	// matched, print out matches
+	bool multiple_match = max_heterozygosity;
+	if(! match_genotype) multiple_match = false; // without genotype matching only one haplotype is reported
+    vector<string> alt_list;
+    alt_list.push_back(max_paths[0]);
+    if(multiple_match)
+        alt_list.push_back(max_paths[1]);
+	DiploidVariant dv(offset, subsequence, alt_list, true, multiple_match);
+	//NormalizeDiploidVariant(dv);
+	string alt_record = dv.alts[0];
+	if (multiple_match)
+		alt_record += "/" + dv.alts[1];
+	string match_record = chromosome_name + "\t" + to_string(dv.pos+1) + "\t" + dv.ref + "\t" + alt_record; // position is written as pos + 1
+	string vcf_record[2] = { "" };
+	string phase_record[4] = { "" };
+	complex_ref_match_num[thread_index] += max_choices[0].size();
+	complex_que_match_num[thread_index] += max_choices[2].size();
+	for (int i = 0; i < 2; i++) {
+		auto final_iter = max_choices[i*2].size()-1; // index of the last entry; wraps when the list is empty, but the loop below then never runs
+		for (int k = 0; k < max_choices[i*2].size(); k++) {
+            int pos = max_choices[i*2][k][0];
+            DiploidVariant variant = separate_pos_var[i][pos];
+            string alt1_string = variant.alts[0]; // second allele: alts[1] if multi-allelic, ref if heterozygous, else alts[0]
+            if (variant.multi_alts) {
+                alt1_string = variant.alts[1];
+            }
+            else if (variant.heterozygous) {
+                alt1_string = variant.ref;
+            }
+            string variant_record = to_string(pos+1) + "," + variant.ref + "," + variant.alts[0];
+            if (multiple_match)
+                variant_record += "/" + alt1_string;
+            vcf_record[i] += variant_record;
+            //cout << pos << ":" << max_matches[i*2+1][pos] << endl;
+            if(multiple_match){
+                phase_record[i * 2] += to_string(max_choices[i*2][k][1]+1); // stored value + 1, so ref prints as 0 and alts[0] as 1
+                phase_record[i * 2 + 1] += to_string(max_choices[i * 2 + 1][k][1]+1); // NOTE(review): assumes max_choices[i*2+1] has an entry for every k - confirm
+            }
+            if (k != final_iter) { // separators go between entries only
+                vcf_record[i] += ";";
+                if(multiple_match){
+                    phase_record[i * 2] += ",";
+                    phase_record[i * 2 + 1] += ",";
+                }
+            }
+		}
+	}
+	match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+	if (multiple_match) {
+		match_record += "\t" + phase_record[0] + "/" + phase_record[1] + "\t" + phase_record[2] + "/" + phase_record[3];
+	}
+	else {
+		match_record += "\t.\t.";
+	}
+	match_record += "\t" + to_string(max_score) + "\n";
+	complex_match_records[thread_index]->push_back(match_record);
+	return true;
+void PrintSelection(VariantSelection selection){ // debug helper: dumps one VariantSelection to cout
+    cout << "$ Selection: $" << endl;
+    cout << "\t genome position:" << selection.genome_position[0] << "," << selection.genome_position[1] << endl;
+    for(int i = 0; i < 2; i++){ // one output line per set: position:phasing pairs
+        for(int k =0; k < selection.pos_vectors[i].size(); k++){
+            cout << "\t" << selection.pos_vectors[i][k] << ":" << selection.phasing_vectors[i][k] << "," ;
+        }
+        cout << endl;
+    }
+    for(int i = 0; i < 4; i++){ // the four donor sequences, comma-separated on one line
+        cout << selection.donor_sequences[i] << "," ;
+    }
+    cout << endl;
+void DiploidVCF::PrintVariant(DiploidVariant var){ // debug helper: prints flag, position, ref and alternate allele(s) of one variant to cout
+    cout << "-Variant:-" << endl;
+    cout << var.flag << "," << var.pos << "," << var.ref << "," << var.alts[0];
+    if(var.multi_alts) cout << "/" << var.alts[1]; // second alternate is shown only for multi-allelic variants
+    cout << endl;
+void PrintSelectionsList(list<VariantSelection> variant_selections){ // debug helper: prints the list size, then every selection via PrintSelection
+    cout << "==========Selections List==================" <<endl;
+    cout << variant_selections.size() << endl;
+    for(auto it = variant_selections.begin(); it!= variant_selections.end(); ++it){
+        VariantSelection selection = *it;
+        PrintSelection(selection);
+    }
+// CheckDonorSequences (code reviewed by Chen on 04/15/2016, unit tested): rebuild the four donor sequences of `selection` from scratch and classify how well the two variant lists agree.
+// Returns -1 = reject; 1 = only a prefix match, or an exact match where the two lists stop at different positions; 2 = exact match with variants still unconsidered; 3 = exact match with every variant considered.
+int DiploidVCF::CheckDonorSequences(vector<DiploidVariant> separate_var_list[],
+                                      VariantSelection & selection,
+                                      const string & subsequence,
+                                      int offset,
+                                      string donor_sequences[]){
+    // separate_var_list[0] and [1] are the two variant lists being compared; subsequence is the
+    // reference window, and offset is subtracted from each variant.pos to index into it.
+    // donor_sequences[2*i] and donor_sequences[2*i+1] receive the two haplotypes built from list i.
+    // selection.phasing_vectors[i][k] says how variant k of list i is applied:
+    // phasing == -1, variant skipped, both haplotypes keep ref
+    // phasing == 0, first haplotype gets alts[0]; second gets alts[1] if multi_alts, ref if heterozygous, alts[0] otherwise
+    // any other phasing (1), first haplotype gets alts[1] if multi_alts, ref otherwise; second gets alts[0]
+    // Also written: selection.min_genome_pos, genome_position, donor_length, haplotypes_consistent.
+    // Disabled shortcut kept for reference: if(selection.score == 0) return -1;
+    // Original author's note: if this is too time consuming, change to the same algorithm as RTG.
+    int genome_position[2] = {-1, -1}; // per list: position (relative to subsequence) where the donors are cut
+    int cut_length[2] = {-1, -1}; // per list: number of trailing characters removed from the donors
+    int pos_lower_bound[2] = {-1, -1}; // exclusive
+    int pos_upper_bound[2] = {-1, -1}; // exclusive
+    int variant_num[2]; // per list: number of variants the selection has already considered
+    for(int i = 0; i < 2; i++){
+        variant_num[i] = (int)selection.phasing_vectors[i].size();
+        if(variant_num[i] == 0){
+            pos_lower_bound[i] = -1;
+        }else{
+            DiploidVariant lower_variant = separate_var_list[i][variant_num[i]-1]; // last considered variant of list i
+            pos_lower_bound[i] = (lower_variant.pos - offset) + lower_variant.ref.length(); // relative end of its ref allele
+        }
+        if(variant_num[i] < separate_var_list[i].size()){
+            pos_upper_bound[i] = separate_var_list[i][variant_num[i]].pos - offset; // relative start of the next unconsidered variant
+        }else{
+            if(selection.separate_score[i] == 0){ // all variants of list i considered; separate_score is incremented in AddVariantToSelection only for applied variants
+                return -1;
+            }
+            pos_upper_bound[i] = (int)subsequence.length();
+        }
+    }
+    if(min(pos_upper_bound[0], pos_upper_bound[1]) - max(pos_lower_bound[0], pos_lower_bound[1]) >= 0){ // smaller upper bound is not before the larger lower bound: cut both lists at the same position
+        genome_position[0] = min(pos_upper_bound[0], pos_upper_bound[1]);
+        genome_position[1] = genome_position[0];
+    }else{
+        genome_position[0] = pos_upper_bound[0];
+        genome_position[1] = pos_upper_bound[1];
+    }
+    cut_length[0] = subsequence.length() - genome_position[0];
+    cut_length[1] = subsequence.length() - genome_position[1];
+    // start every donor from the unmodified reference window
+    for(int i = 0; i < 2; i++){
+        donor_sequences[i*2] = subsequence;
+        donor_sequences[i*2+1] = subsequence;
+    }
+    for(int i = 0; i < 2; i++){
+        for(int k = (int)selection.phasing_vectors[i].size() - 1; k >= 0; k--){ // walks from the last considered variant to the first; presumably so earlier coordinates stay valid after a splice
+            int temp_phasing = selection.phasing_vectors[i][k];
+            if(temp_phasing == -1){
+                continue;
+            }
+            DiploidVariant temp_var = separate_var_list[i][k];
+            int temp_pos = temp_var.pos;
+            int temp_end = temp_pos + temp_var.ref.length();
+            int relative_end = temp_end - offset;
+            int relative_start = temp_pos - offset;
+            if(relative_start < 0 || relative_end > donor_sequences[i*2].length() || relative_end > donor_sequences[i*2+1].length()){ // ref span does not fit inside the current donors
+                //dout << "overlapping variants" << endl;
+                return -1;
+            }
+            string one_alt = ""; // allele spliced into donor_sequences[i*2]
+            string other_alt = ""; // allele spliced into donor_sequences[i*2+1]
+            string var_ref = temp_var.ref;
+            if(temp_phasing == 0){
+                one_alt = temp_var.alts[0];
+                if(temp_var.multi_alts){
+                    other_alt = temp_var.alts[1];
+                }else if(temp_var.heterozygous){
+                    other_alt = var_ref;
+                }else{
+                    other_alt = one_alt;
+                }
+            }else{
+                if(temp_var.multi_alts){
+                    one_alt = temp_var.alts[1];
+                }else{
+                    one_alt = var_ref;
+                }
+                other_alt = temp_var.alts[0];
+            }
+            string t_sequence = donor_sequences[i*2];
+            string pre_string = t_sequence.substr(0, relative_start);
+            string post_string = t_sequence.substr(relative_end, t_sequence.length() - relative_end);
+            donor_sequences[i*2] = pre_string + one_alt + post_string; // replace [relative_start, relative_end) with the chosen allele
+            t_sequence = donor_sequences[i*2+1];
+            pre_string = t_sequence.substr(0, relative_start);
+            post_string = t_sequence.substr(relative_end, t_sequence.length() - relative_end);
+            donor_sequences[i*2+1] = pre_string + other_alt + post_string;
+//            cout << ":::::::" << endl;
+//            cout << subsequence << ", " << offset << endl;
+//            PrintVariant(temp_var);
+//            cout << relative_start << "," << relative_end << endl;
+//            cout << donor_sequences[i*2] << endl;
+//            cout << donor_sequences[i*2+1] << endl;
+        }
+//        cout << pos_lower_bound[i] << "," << pos_upper_bound[i] << "," ;
+//        cout << genome_position[i] << "," << cut_length[i] << endl;
+    }
+    for(int i = 0; i < 2; i++){
+//        cout << "&&&&&" << genome_position[i] << "," << cut_length[i] << endl;
+        if(cut_length[i] < (int)subsequence.length()){ // equivalent to genome_position[i] > 0
+            donor_sequences[i*2] = donor_sequences[i*2].substr(0, donor_sequences[i*2].length() - cut_length[i]); // drop the last cut_length[i] characters
+            donor_sequences[i*2+1] = donor_sequences[i*2+1].substr(0, donor_sequences[i*2+1].length() - cut_length[i]);
+        }else{
+            donor_sequences[i*2] = "";
+            donor_sequences[i*2+1] = "";
+        }
+        if(genome_position[i] < 0) genome_position[i] = -1;
+    }
+    selection.min_genome_pos = min(genome_position[0], genome_position[1]);
+//    cout << "after apply Selection:" << endl;
+//    cout << donor_sequences[0] << endl;
+//    cout << donor_sequences[1] << endl;
+//    cout << donor_sequences[2] << endl;
+//    cout << donor_sequences[3] << endl;
+    bool donor_match = false;
+    if(donor_sequences[0] == donor_sequences[2] && donor_sequences[1] == donor_sequences[3]){ // haplotypes equal in the same order
+        donor_match = true;
+        selection.haplotypes_consistent = true;
+    }else if(donor_sequences[0] == donor_sequences[3] && donor_sequences[1] == donor_sequences[2]){ // haplotypes equal with the order swapped
+        donor_match = true;
+        selection.haplotypes_consistent = true;
+    }
+    for(int i = 0; i < 2; i++){
+        selection.genome_position[i] = genome_position[i];
+        selection.donor_length[i] = donor_sequences[i].length(); // NOTE(review): indexes donor_sequences[0] and [1], which are both built from list 0 - confirm this is intended
+    }
+    if(! donor_match){
+        if(variant_num[0] == separate_var_list[0].size() && variant_num[1] == separate_var_list[1].size()) return -1; // mismatch and every variant already considered
+        selection.haplotypes_consistent = false;
+        bool prefix_match = false; // PrefixMatch is defined outside this block; presumably a one-is-prefix-of-the-other test - verify
+        if(PrefixMatch(donor_sequences[0], donor_sequences[2]) && PrefixMatch(donor_sequences[1], donor_sequences[3])){
+            prefix_match = true;
+        }else if(PrefixMatch(donor_sequences[0], donor_sequences[3]) && PrefixMatch(donor_sequences[1], donor_sequences[2])){
+            prefix_match = true;
+        }
+        if(prefix_match){
+            return 1;
+        }else{
+            return -1;
+        }
+    }
+    if(genome_position[0]!=genome_position[1]) return 1; // exact match, but the two lists were cut at different positions
+    if(variant_num[0] == separate_var_list[0].size() && variant_num[1] == separate_var_list[1].size()){
+        // every variant of both lists has been considered
+        return 3;
+    }
+    // exact match before the end: the donors were cut at genome_position
+    // (original note: set min_donor_length)
+    // (original note: set need_variant = true, because not all variants were used up)
+    return 2;
+// code review by Chen on 04/15/2016 and unit test
+// if time consuming, change to the same algorithm as RTG
+//
+// CheckDonorSequencesWithOverlap: same contract as CheckDonorSequences, but the
+// lower bound of each list is widened to the furthest end of any applied variant,
+// and applying two consecutive variants with the same position and ref is rejected.
+//
+// Parameters:
+//   separate_var_list  the two variant lists being compared (index 0 and 1)
+//   selection          phasing_vectors / separate_score are read; min_genome_pos,
+//                      genome_position, donor_length and haplotypes_consistent are written
+//   subsequence        reference window the donors are built from
+//   offset             subtracted from variant.pos to index into subsequence
+//   donor_sequences    output, 4 entries: [2*i] and [2*i+1] are the haplotypes of list i
+//
+// Returns:
+//   -1  reject (variant outside the window, same span changed twice, no match possible)
+//    1  only a prefix match, or exact match with the lists cut at different positions
+//    2  exact match, some variants not yet considered
+//    3  exact match, every variant considered
+//
+// Phasing values (selection.phasing_vectors[i][k]):
+//   -1  variant skipped, both haplotypes keep ref
+//    0  first haplotype gets alts[0]; second gets alts[1] if multi_alts,
+//       ref if heterozygous, alts[0] otherwise
+//    otherwise (1)  first haplotype gets alts[1] if multi_alts, ref otherwise;
+//       second gets alts[0]
+int DiploidVCF::CheckDonorSequencesWithOverlap(vector<DiploidVariant> separate_var_list[],
+                                      VariantSelection & selection,
+                                      const string & subsequence,
+                                      int offset,
+                                      string donor_sequences[]){
+    int genome_position[2] = {-1, -1};
+    int cut_length[2] = {-1, -1};
+    int pos_lower_bound[2] = {-1, -1}; // exclusive, relative to subsequence
+    int pos_upper_bound[2] = {-1, -1}; // exclusive, relative to subsequence
+    int variant_num[2];
+    for(int i = 0; i < 2; i++){
+        variant_num[i] = (int)selection.phasing_vectors[i].size();
+        if(variant_num[i] == 0){
+            pos_lower_bound[i] = -1;
+        }else{
+            DiploidVariant lower_variant = separate_var_list[i][variant_num[i]-1];
+            pos_lower_bound[i] = (lower_variant.pos - offset) + lower_variant.ref.length();
+        }
+        if(variant_num[i] < separate_var_list[i].size()){
+            pos_upper_bound[i] = separate_var_list[i][variant_num[i]].pos - offset;
+        }else{
+            if(selection.separate_score[i] == 0){
+                return -1;
+            }
+            pos_upper_bound[i] = (int)subsequence.length();
+        }
+    }
+    // start every donor from the unmodified reference window
+    for(int i = 0; i < 2; i++){
+        donor_sequences[i*2] = subsequence;
+        donor_sequences[i*2+1] = subsequence;
+    }
+    for(int i = 0; i < 2; i++){
+        DiploidVariant pre_var;
+        // pre_var is only meaningful after the first applied variant; without this
+        // guard the check below would compare against a default-constructed object
+        bool has_pre_var = false;
+        for(int k = (int)selection.phasing_vectors[i].size() - 1; k >= 0; k--){
+            int temp_phasing = selection.phasing_vectors[i][k];
+            if(temp_phasing == -1){
+                continue;
+            }
+            DiploidVariant temp_var = separate_var_list[i][k];
+            // can not change the same sequence twice
+            // (the original wrote `temp_var.pos = ...`, which assigned 0/1 to pos
+            // instead of comparing and corrupted every coordinate derived below)
+            if(has_pre_var && temp_var.pos == pre_var.pos && temp_var.ref == pre_var.ref) return -1;
+            int temp_pos = temp_var.pos;
+            int temp_end = temp_pos + temp_var.ref.length();
+            int relative_end = temp_end - offset;
+            int relative_start = temp_pos - offset;
+            // pos_lower_bound is offset-relative (see its initialisation above and its
+            // comparison with pos_upper_bound below), so fold in the relative end,
+            // not the absolute genome coordinate
+            pos_lower_bound[i] = max(pos_lower_bound[i], relative_end);
+            if(relative_start < 0 || relative_end > donor_sequences[i*2].length() || relative_end > donor_sequences[i*2+1].length()){
+                // ref span does not fit inside the current donors
+                return -1;
+            }
+            string one_alt = "";   // allele spliced into donor_sequences[i*2]
+            string other_alt = ""; // allele spliced into donor_sequences[i*2+1]
+            string var_ref = temp_var.ref;
+            if(temp_phasing == 0){
+                one_alt = temp_var.alts[0];
+                if(temp_var.multi_alts){
+                    other_alt = temp_var.alts[1];
+                }else if(temp_var.heterozygous){
+                    other_alt = var_ref;
+                }else{
+                    other_alt = one_alt;
+                }
+            }else{
+                if(temp_var.multi_alts){
+                    one_alt = temp_var.alts[1];
+                }else{
+                    one_alt = var_ref;
+                }
+                other_alt = temp_var.alts[0];
+            }
+            // replace [relative_start, relative_end) in both haplotypes of list i
+            string t_sequence = donor_sequences[i*2];
+            string pre_string = t_sequence.substr(0, relative_start);
+            string post_string = t_sequence.substr(relative_end, t_sequence.length() - relative_end);
+            donor_sequences[i*2] = pre_string + one_alt + post_string;
+            t_sequence = donor_sequences[i*2+1];
+            pre_string = t_sequence.substr(0, relative_start);
+            post_string = t_sequence.substr(relative_end, t_sequence.length() - relative_end);
+            donor_sequences[i*2+1] = pre_string + other_alt + post_string;
+            pre_var = temp_var;
+            has_pre_var = true;
+        }
+    }
+    // cut both lists at one common position when the bounds allow it,
+    // otherwise each list is cut at its own upper bound
+    if(min(pos_upper_bound[0], pos_upper_bound[1]) - max(pos_lower_bound[0], pos_lower_bound[1]) >= 0){
+        genome_position[0] = min(pos_upper_bound[0], pos_upper_bound[1]);
+        genome_position[1] = genome_position[0];
+    }else{
+        genome_position[0] = pos_upper_bound[0];
+        genome_position[1] = pos_upper_bound[1];
+    }
+    cut_length[0] = subsequence.length() - genome_position[0];
+    cut_length[1] = subsequence.length() - genome_position[1];
+    for(int i = 0; i < 2; i++){
+        if(cut_length[i] < (int)subsequence.length()){
+            // drop the last cut_length[i] characters of both haplotypes
+            donor_sequences[i*2] = donor_sequences[i*2].substr(0, donor_sequences[i*2].length() - cut_length[i]);
+            donor_sequences[i*2+1] = donor_sequences[i*2+1].substr(0, donor_sequences[i*2+1].length() - cut_length[i]);
+        }else{
+            donor_sequences[i*2] = "";
+            donor_sequences[i*2+1] = "";
+        }
+        if(genome_position[i] < 0) genome_position[i] = -1;
+    }
+    selection.min_genome_pos = min(genome_position[0], genome_position[1]);
+    bool donor_match = false;
+    if(donor_sequences[0] == donor_sequences[2] && donor_sequences[1] == donor_sequences[3]){
+        donor_match = true;
+        selection.haplotypes_consistent = true;
+    }else if(donor_sequences[0] == donor_sequences[3] && donor_sequences[1] == donor_sequences[2]){
+        donor_match = true;
+        selection.haplotypes_consistent = true;
+    }
+    for(int i = 0; i < 2; i++){
+        selection.genome_position[i] = genome_position[i];
+        selection.donor_length[i] = donor_sequences[i].length();
+    }
+    if(! donor_match){
+        // mismatch with every variant already considered can never be repaired
+        if(variant_num[0] == separate_var_list[0].size() && variant_num[1] == separate_var_list[1].size()) return -1;
+        selection.haplotypes_consistent = false;
+        bool prefix_match = false;
+        if(PrefixMatch(donor_sequences[0], donor_sequences[2]) && PrefixMatch(donor_sequences[1], donor_sequences[3])){
+            prefix_match = true;
+        }else if(PrefixMatch(donor_sequences[0], donor_sequences[3]) && PrefixMatch(donor_sequences[1], donor_sequences[2])){
+            prefix_match = true;
+        }
+        if(prefix_match){
+            return 1;
+        }else{
+            return -1;
+        }
+    }
+    if(genome_position[0]!=genome_position[1]) return 1;
+    if(variant_num[0] == separate_var_list[0].size() && variant_num[1] == separate_var_list[1].size()){
+        // every variant of both lists has been considered
+        return 3;
+    }
+    // exact match before the end: donors were cut, more variants are still needed
+    return 2;
+int DiploidVCF::ExtendingDonorSequences(vector<DiploidVariant> separate_var_list[], // Incrementally extends selection.donor_sequences instead of rebuilding them; separate_var_list holds the two variant lists (index 0 and 1)
+                                      VariantSelection & selection, // updated in place: donor_sequences, genome_position, donor_length, min_genome_pos, overlap_detected, haplotypes_consistent
+                                      const string & subsequence, // reference window the donors are extended from
+                                      int offset, // subtracted from variant.pos to index into subsequence
+                                      int flag){ // index of the list whose last variant is spliced in; AddVariantToSelection passes -1 when the new variant was skipped. Returns -1/1/2/3 as annotated at each return below
+    int genome_position[2] = {0, 0}; // per list: new end position of the donors, relative to subsequence
+    int pos_lower_bound[2] = {0, 0}; // exclusive
+    int pos_upper_bound[2] = {0, 0}; // exclusive
+    int variant_num[2]; // per list: number of variants the selection has already considered
+    bool consider_all_variants = true; // stays true only if both lists are fully considered
+    for(int i = 0; i < 2; i++){
+        variant_num[i] = (int)selection.phasing_vectors[i].size();
+        if(variant_num[i] == 0){
+            pos_lower_bound[i] = 0;
+        }else{
+            DiploidVariant lower_variant = separate_var_list[i][variant_num[i]-1]; // last considered variant of list i
+            pos_lower_bound[i] = (lower_variant.pos - offset) + lower_variant.ref.length(); // relative end of its ref allele
+        }
+        if(variant_num[i] < separate_var_list[i].size()){
+            consider_all_variants = false;
+            pos_upper_bound[i] = separate_var_list[i][variant_num[i]].pos - offset; // relative start of the next unconsidered variant
+        }else{
+            if(selection.separate_score[i] == 0){ // list i fully considered with separate_score 0: reject
+                return -1;
+            }
+            pos_upper_bound[i] = (int)subsequence.length();
+        }
+        //if(pos_upper_bound[i] < pos_lower_bound[i]) pos_upper_bound[i] = pos_lower_bound[i];
+//        dout << i << " lower bound:" << pos_lower_bound[i] << endl;
+//        dout << i << " upper bound:" << pos_upper_bound[i] << endl;
+    }
+    if(min(pos_upper_bound[0], pos_upper_bound[1]) - max(pos_lower_bound[0], pos_lower_bound[1]) >= 0){ // smaller upper bound is not before the larger lower bound: both lists end at the same position
+        genome_position[0] = min(pos_upper_bound[0], pos_upper_bound[1]);
+        genome_position[1] = genome_position[0];
+    }else{
+        genome_position[0] = pos_upper_bound[0];
+        genome_position[1] = pos_upper_bound[1];
+    }
+    for(int i = 0; i < 2; i++){
+        // list i either only moves its end position (i != flag) or also splices in its last variant (i == flag)
+        int pre_start = selection.genome_position[i]; // where the donors of list i currently end
+        if(i!=flag){
+            if(pre_start == genome_position[i]) continue;
+            if(pre_start > genome_position[i]){ // new end is earlier: trim the tail of both haplotypes
+                int cut_len = pre_start - genome_position[i];
+                selection.donor_sequences[i*2] = selection.donor_sequences[i*2].substr(0, selection.donor_sequences[i*2].length()-cut_len);
+                selection.donor_sequences[i*2+1] = selection.donor_sequences[i*2+1].substr(0, selection.donor_sequences[i*2+1].length()-cut_len);
+            }else{ // new end is later: append the unchanged reference in between
+                string post_s = subsequence.substr(pre_start, genome_position[i]-pre_start);
+                selection.donor_sequences[i*2] += post_s;
+                selection.donor_sequences[i*2+1] += post_s;
+            }
+            selection.genome_position[i] = genome_position[i];
+        }else{
+            int last_i = variant_num[i]-1;
+            DiploidVariant last_v = separate_var_list[i][last_i]; // the variant most recently added to list i
+            int last_phase = selection.phasing_vectors[i][last_i];
+            int pre_end = last_v.pos - offset; // relative start of the variant's ref allele
+            int post_start = pre_end + last_v.ref.length(); // relative end of the variant's ref allele
+            if(pre_end < pre_start){ // variant starts before the point the donors already reach
+                dout << "error when extend donor sequence" << endl;
+                return -1;
+            }
+            int post_end = genome_position[i];
+            if(post_end < post_start){ // planned end falls inside this variant's ref span: flag overlap and end right after the variant
+                selection.overlap_detected = true;
+                genome_position[i] = post_start;
+                post_end = post_start;
+            }
+            string var_ref = last_v.ref;
+            string one_alt = var_ref; // allele appended to donor_sequences[i*2]; stays ref unless set below
+            string other_alt = var_ref; // allele appended to donor_sequences[i*2+1]; stays ref unless set below
+            if(last_phase == 0){
+                one_alt = last_v.alts[0];
+                if(last_v.multi_alts){
+                    other_alt = last_v.alts[1];
+                }else if(!last_v.heterozygous){
+                    other_alt = one_alt;
+                }
+            }else if(last_phase == 1){
+                if(last_v.multi_alts){
+                    one_alt = last_v.alts[1];
+                }
+                other_alt = last_v.alts[0];
+            }
+            string pre_string = subsequence.substr(pre_start, pre_end-pre_start); // reference between the old end and the variant
+            string post_string = subsequence.substr(post_start, post_end - post_start); // reference between the variant and the new end
+            selection.donor_sequences[i*2] += pre_string + one_alt + post_string;
+            selection.donor_sequences[i*2+1] += pre_string + other_alt + post_string;
+            selection.genome_position[i] = genome_position[i];
+        }
+    }
+    bool same_genome_position = false;
+    if(genome_position[0]==genome_position[1]) same_genome_position = true;
+    if(same_genome_position){
+        selection.min_genome_pos = genome_position[0];
+    }else{
+        selection.min_genome_pos = min(genome_position[0], genome_position[1]);
+    }
+    for(int i = 0; i < 2; i++){
+        selection.donor_length[i] = selection.donor_sequences[i].length(); // NOTE(review): indexes donor_sequences[0] and [1], both belonging to list 0 - confirm this is intended
+    }
+    bool donor_match = false;
+    if(same_genome_position){ // exact equality is only tested when both lists end at the same position
+        if(selection.donor_sequences[0] == selection.donor_sequences[2] && selection.donor_sequences[1] == selection.donor_sequences[3]){
+            donor_match = true;
+            selection.haplotypes_consistent = true;
+        }
+        else if(selection.donor_sequences[0] == selection.donor_sequences[3] && selection.donor_sequences[1] == selection.donor_sequences[2]){
+            donor_match = true;
+            selection.haplotypes_consistent = true;
+        }
+    }
+    // matching prefix is actually not necessary, we can postpone until we get the same sequence length
+    if(! donor_match){
+        if(consider_all_variants) return -1; // mismatch and every variant already considered: reject
+        selection.haplotypes_consistent = false;
+        bool prefix_match = false; // PrefixMatch is defined outside this block; presumably a one-is-prefix-of-the-other test - verify
+        if(PrefixMatch(selection.donor_sequences[0], selection.donor_sequences[2]) && PrefixMatch(selection.donor_sequences[1], selection.donor_sequences[3])){
+            prefix_match = true;
+        }
+        else if(PrefixMatch(selection.donor_sequences[0], selection.donor_sequences[3]) && PrefixMatch(selection.donor_sequences[1], selection.donor_sequences[2])){
+            prefix_match = true;
+        }
+        if(prefix_match){
+//            if(same_genome_position){
+//                return 4;
+//            }
+            return 1; // not an exact match, but the donors are still prefix-compatible
+        }else{
+            return -1; // donors diverge: reject
+        }
+    }
+    if(consider_all_variants){
+        return 3; // exact match and every variant considered
+    }
+    return 2; // exact match, variants remain
+// code review by Chen on 04/15/2016
+// [TODO] unit test
+//
+// AddVariantToSelection: record `variant` with phasing `haplotype` in a private copy
+// of `selection` (hence passed by value), re-evaluate the donor sequences, and file
+// the result according to the returned state:
+//   <= 0  neither match nor prefix match: dropped
+//   1     prefix match only: inserted into variant_selections in sorted order
+//   2     match, variants remain: handed to CollapseSelections
+//   3     match, all variants considered: may replace best_selection, never inserted
+// Returns true only if the selection ended up in variant_selections.
+bool DiploidVCF::AddVariantToSelection(list<VariantSelection> & variant_selections,
+                                       VariantSelection selection,
+                                       DiploidVariant variant,
+                                       int haplotype,
+                                       vector<DiploidVariant> separate_var_list[],
+                                       const string & subsequence,
+                                       int offset,
+                                       VariantSelection & best_selection){
+    const int list_index = variant.flag;
+    const int variant_pos = variant.pos;
+    selection.pos_vectors[list_index].push_back(variant_pos);
+    selection.phasing_vectors[list_index].push_back(haplotype);
+
+    // a haplotype of -1 means the variant is skipped: no score, and no list is
+    // reported to ExtendingDonorSequences as having received an applied variant
+    const bool variant_applied = (haplotype != -1);
+    if(variant_applied){
+        selection.score++;
+        selection.separate_score[list_index]++;
+    }
+    const int extend_flag = variant_applied ? list_index : -1;
+
+    int consistent_state = 0;
+    if(!selection.overlap_detected){
+        // cheap path: append to the donor sequences already stored in the selection
+        consistent_state = ExtendingDonorSequences(separate_var_list,
+                                                   selection,
+                                                   subsequence,
+                                                   offset,
+                                                   extend_flag);
+    }else{
+        // an overlap was seen earlier: rebuild all four donors from scratch
+        string rebuilt_donors[4];
+        consistent_state = CheckDonorSequences(separate_var_list,
+                                               selection,
+                                               subsequence,
+                                               offset,
+                                               rebuilt_donors);
+        for(int d = 0; d < 4; d++){
+            selection.donor_sequences[d] = rebuilt_donors[d];
+        }
+    }
+
+    switch(consistent_state){
+        case 1: {
+            // keep the list ordered: insert after every element not greater than selection
+            auto insert_at = upper_bound(variant_selections.begin(), variant_selections.end(), selection);
+            variant_selections.insert(insert_at, selection);
+            return true;
+        }
+        case 2:
+            // you can only collapse one selection at a time
+            return CollapseSelections(selection, variant_selections);
+        case 3:
+            if(selection.score > best_selection.score){
+                best_selection = selection;
+            }
+            return false;
+        default:
+            return false;
+    }
+// CollapsePrefixMatchSelection: insert `selection` into the list ordered by
+// min_genome_pos, merging it with an existing entry that has the same
+// min_genome_pos, equal genome_position[0]/[1] of its own, and four identical
+// donor sequences. Of two such entries only the higher-scoring one is kept.
+// Returns true if `selection` was inserted, false if an existing entry won.
+bool DiploidVCF::CollapsePrefixMatchSelection(VariantSelection selection,
+                                    list<VariantSelection> & variant_selections){
+    auto cur = variant_selections.begin();
+    while(cur != variant_selections.end()){
+        // first entry that lies further along the genome: insert in front of it
+        if(cur->min_genome_pos > selection.min_genome_pos){
+            variant_selections.insert(cur, selection);
+            return true;
+        }
+        const bool equivalent =
+            cur->min_genome_pos == selection.min_genome_pos &&
+            cur->genome_position[0] == cur->genome_position[1] && // also same genome position
+            cur->donor_sequences[0] == selection.donor_sequences[0] &&
+            cur->donor_sequences[1] == selection.donor_sequences[1] &&
+            cur->donor_sequences[2] == selection.donor_sequences[2] &&
+            cur->donor_sequences[3] == selection.donor_sequences[3];
+        if(!equivalent){
+            ++cur;
+            continue;
+        }
+        // equivalent entry found: the existing one survives unless it scores lower
+        if(!(cur->score < selection.score)){
+            return false;
+        }
+        // erase() returns the following position, so the new selection takes the old slot
+        variant_selections.insert(variant_selections.erase(cur), selection);
+        return true;
+    }
+    variant_selections.push_back(selection); // nothing larger and nothing equivalent: append
+    return true;
+// code review by Chen on 04/15/2016, unit test
+//
+// CollapseSelections: insert `selection` into the sorted list, merging it with an
+// equivalent entry if one exists. Candidates are the entries that compare equal to
+// `selection` (the [lower_bound, upper_bound) range). A candidate is equivalent when
+// it is haplotype-consistent, has the same genome_position pair, and the same
+// donor_length pair in either order. Of two equivalent entries only the
+// higher-scoring one is kept.
+// Returns true if `selection` was inserted, false if an existing entry won.
+bool DiploidVCF::CollapseSelections(VariantSelection selection,
+                                    list<VariantSelection> & variant_selections){
+    auto range_begin = lower_bound(variant_selections.begin(), variant_selections.end(), selection);
+    auto range_end = upper_bound(range_begin, variant_selections.end(), selection);
+
+    const bool has_candidates = range_begin != variant_selections.end() &&
+                                range_begin->min_genome_pos == selection.min_genome_pos;
+    if(has_candidates){
+        for(auto cur = range_begin; cur != range_end; ++cur){
+            if(!cur->haplotypes_consistent) continue;
+            if(cur->genome_position[0] != selection.genome_position[0]) continue;
+            if(cur->genome_position[1] != selection.genome_position[1]) continue;
+            const bool same_order = cur->donor_length[0] == selection.donor_length[0] &&
+                                    cur->donor_length[1] == selection.donor_length[1];
+            const bool swapped_order = cur->donor_length[1] == selection.donor_length[0] &&
+                                       cur->donor_length[0] == selection.donor_length[1];
+            if(!same_order && !swapped_order) continue;
+
+            // equivalent entry found: the existing one survives unless it scores lower
+            if(!(cur->score < selection.score)){
+                return false;
+            }
+            // erase() returns the following position, so the new selection takes the old slot
+            variant_selections.insert(variant_selections.erase(cur), selection);
+            return true;
+        }
+    }
+    // no candidates, or none of them equivalent: plain sorted insert
+    variant_selections.insert(range_end, selection);
+    return true;
+// code reviewed by Chen on 04/15/2016
+// [TODO] unit test
+// Match the reference-set (flag 0) and query-set (flag 1) variants of one
+// cluster and, on success, append one record to
+// complex_match_records[thread_index].
+//
+// Steps, as implemented below:
+//   1. Sort the cluster, split it by variant.flag and compute the genome
+//      window [min_pos, max_pos), padded by one base on each side and clamped
+//      to the genome.
+//   2. Shortcuts: fail when either side is empty; report a direct match when
+//      both sides hold exactly one variant and they compare equal; fail when
+//      one side holds a single variant, the other more than four, and the
+//      single variant's max(mdl, mil) exceeds max(sum mdl, sum mil) of the
+//      other side.
+//   3. Mark a variant as not appliable when its max(mil, mdl) is larger than
+//      the cluster-wide (total_mil + total_mdl) minus that value.
+//   4. Worklist search: pop a partial selection, choose the next ref or query
+//      variant, and pass it to AddVariantToSelection once per phasing choice
+//      (0, then 1 for heterozygous variants, then -1 = variant not used).
+//   5. Normalise the best selection and emit it with per-variant phasing.
+//
+// variant_list  cluster to match; sorted in place.
+// thread_index  index into the per-thread counters and record buffers.
+// cluster_id    not used by this function.
+// Returns true when a match record was appended, false otherwise.
+bool DiploidVCF::AcceleratedVariantMatchPathCreation(vector<DiploidVariant> & variant_list, int thread_index, int cluster_id){
+    if(variant_list.size() <= 1) return false;
+    sort(variant_list.begin(), variant_list.end()); // here we need to sort
+    vector<DiploidVariant> separate_var_list[2]; // [0] = ref set, [1] = query set
+    int total_mil = 0;
+    int total_mdl = 0;
+    int min_pos = genome_sequence.length() + 1;
+    int max_pos = -1;
+    for (size_t i = 0; i < variant_list.size(); i++) {
+        const DiploidVariant & var = variant_list[i]; // reference: avoids copying ref/alts per iteration
+        int flag = var.flag; // flag indicate if the variant is from ref set(0) or query set(1)
+        int pos = var.pos;
+        separate_var_list[flag].push_back(var);
+        total_mil += var.mil;
+        total_mdl += var.mdl;
+        min_pos = min(pos, min_pos);
+        max_pos = max((int)(pos + var.ref.length()), max_pos);
+    }
+    min_pos = max(min_pos - 1, 0);
+    max_pos = min(max_pos + 1, (int)genome_sequence.length()); //exclusive
+    if (separate_var_list[0].size() == 0 || separate_var_list[1].size() == 0) {
+        return false;
+    }
+    if (separate_var_list[0].size() == 1 && separate_var_list[1].size() == 1){
+        // try direct match to save time
+        if(separate_var_list[0][0] == separate_var_list[1][0]){
+            complex_ref_match_num[thread_index]++;
+            complex_que_match_num[thread_index]++;
+            const DiploidVariant & tv = separate_var_list[0][0];
+            string match_record = to_string(tv.pos+1) + "\t" + tv.ref + "\t" + tv.alts[0];
+            if(tv.multi_alts) match_record += "/" + tv.alts[1];
+            match_record += "\t.\t.\t.\t.\t.\n";
+            complex_match_records[thread_index]->push_back(match_record);
+            return true;
+        }
+        // if not match, still can match by changing genome
+    }else if(separate_var_list[0].size() == 1 || separate_var_list[1].size() == 1){
+        // flag = side holding the single variant, r_flag = the other side
+        int flag = 0;
+        if(separate_var_list[1].size() == 1) flag = 1;
+        int r_flag = 1-flag;
+        if(separate_var_list[r_flag].size() > 4){
+            int total_r_mdl = 0;
+            int total_r_mil = 0;
+            for(size_t k = 0; k < separate_var_list[r_flag].size(); k++){
+                const DiploidVariant & var = separate_var_list[r_flag][k];
+                total_r_mdl += var.mdl;
+                total_r_mil += var.mil;
+            }
+            if(max(separate_var_list[flag][0].mdl, separate_var_list[flag][0].mil) > max(total_r_mdl, total_r_mil)) return false;
+        }
+    }
+    // remove singular variant: a variant whose own change outweighs everything
+    // else in the cluster may only be offered with phasing -1 below
+    vector<bool> appliable_flag[2];
+    int total_change = total_mil+total_mdl;
+    for(int i = 0; i < 2; i++){
+        for(size_t k = 0; k < separate_var_list[i].size(); k++){
+            const DiploidVariant & cur_var = separate_var_list[i][k];
+            int max_change = max(cur_var.mil, cur_var.mdl);
+            appliable_flag[i].push_back(max_change <= total_change-max_change);
+        }
+    }
+    string subsequence = genome_sequence.substr(min_pos, max_pos - min_pos);
+    ToUpper(subsequence); // subsequence only contains upper char
+    int offset = min_pos;
+    list<VariantSelection> variant_selections; // sorted by last matched donor length
+    VariantSelection best_selection;
+    VariantSelection dummy;
+    // Flag the cluster when, within one side, a variant starts at or before the
+    // furthest end (pos + ref length) seen so far on that side.
+    bool overlap_detected = false;
+    for(int i = 0; i < 2; i++){
+        int largest_pos = 0;
+        for(size_t k = 0; k < separate_var_list[i].size(); k++){
+            const DiploidVariant & var = separate_var_list[i][k];
+            if(var.pos <= largest_pos){
+                overlap_detected = true;
+                break;
+            }
+            largest_pos = max(largest_pos, (int)(var.pos+var.ref.length()));
+        }
+        if(overlap_detected) break;
+    }
+    dummy.overlap_detected = overlap_detected;
+    variant_selections.push_back(dummy);
+    while(variant_selections.size() != 0){
+        VariantSelection current_selection = variant_selections.front();
+        variant_selections.pop_front();
+        int ref_var_taken = current_selection.phasing_vectors[0].size();
+        int que_var_taken = current_selection.phasing_vectors[1].size();
+        bool ref_exhausted = ref_var_taken >= (int)separate_var_list[0].size();
+        bool que_exhausted = que_var_taken >= (int)separate_var_list[1].size();
+        // Defensive guard: with both sides used up there is no next variant, and
+        // indexing below would read past the end of the vectors.
+        // NOTE(review): AddVariantToSelection presumably never re-queues a
+        // complete selection, in which case this never fires - confirm.
+        if(ref_exhausted && que_exhausted) continue;
+        // Decide which side supplies the next variant.
+        bool get_ref_var = true;
+        if(ref_exhausted){
+            get_ref_var = false;
+        }else if(!que_exhausted){
+            if(current_selection.genome_position[0] > current_selection.genome_position[1]){
+                get_ref_var = false;
+            }else if(current_selection.genome_position[0] == current_selection.genome_position[1]){
+                if(min(current_selection.donor_length[0], current_selection.donor_length[1]) > min(current_selection.donor_length[2], current_selection.donor_length[3])){
+                    get_ref_var = false;
+                }
+            }
+        }
+        int side = get_ref_var ? 0 : 1;
+        int taken = get_ref_var ? ref_var_taken : que_var_taken;
+        bool can_take_variant = appliable_flag[side][taken];
+        DiploidVariant current_variant = separate_var_list[side][taken];
+        // make choose decision before not choose decision, save del times
+        if(can_take_variant){
+            AddVariantToSelection(variant_selections,
+                                current_selection,
+                                current_variant,
+                                0,
+                                separate_var_list,
+                                subsequence,
+                                offset,
+                                best_selection);
+            if(current_variant.heterozygous){
+                AddVariantToSelection(variant_selections,
+                                    current_selection,
+                                    current_variant,
+                                    1,
+                                    separate_var_list,
+                                    subsequence,
+                                    offset,
+                                    best_selection);
+            }
+        }
+        // phasing -1: continue the search without this variant
+        AddVariantToSelection(variant_selections,
+                            current_selection,
+                            current_variant,
+                            -1,
+                            separate_var_list,
+                            subsequence,
+                            offset,
+                            best_selection);
+    }
+    if (best_selection.score <= 0) return false;
+    complex_ref_match_num[thread_index] += best_selection.separate_score[0];
+    complex_que_match_num[thread_index] += best_selection.separate_score[1];
+    string parsimonious_ref = subsequence;
+    string parsimonious_alt0 = best_selection.donor_sequences[0];
+    string parsimonious_alt1 = best_selection.donor_sequences[1];
+    int parsimonious_pos = NormalizeVariantSequence(offset,
+                             parsimonious_ref,
+                             parsimonious_alt0,
+                             parsimonious_alt1);
+    // Both donor alleles are always written. The former "multiple_match" flag
+    // was initialised to true and never cleared, so its test was dead code.
+    // NOTE(review): if identical alleles should be written once, that is an
+    // output-format change and must be coordinated with record consumers.
+    string match_record = to_string(parsimonious_pos+1) + "\t" + parsimonious_ref + "\t" + parsimonious_alt0;
+    match_record += "/" + parsimonious_alt1;
+    string vcf_record[2];
+    string phasing_record[2];
+    for (int i = 0; i < 2; i++) {
+        size_t final_iter = separate_var_list[i].size()-1;
+        const vector<int> & phasing_vector = best_selection.phasing_vectors[i];
+        for (size_t k = 0; k < separate_var_list[i].size(); k++) {
+            int phasing = phasing_vector[k];
+            if(phasing == -1) continue; // variant not part of the match
+            const DiploidVariant & variant = separate_var_list[i][k];
+            string alt_string = variant.alts[0];
+            if(variant.multi_alts){
+                alt_string += "/" + variant.alts[1];
+            }
+            string phasing_string = "";
+            if(phasing == 0){
+                phasing_string += "1";
+                if(variant.heterozygous){
+                    if(variant.multi_alts){
+                        phasing_string += "|2";
+                    }else{
+                        phasing_string += "|0";
+                    }
+                }else{
+                    phasing_string += "|1";
+                }
+            }else if(phasing == 1){
+                if(variant.multi_alts){
+                    phasing_string += "2|1";
+                }else{
+                    phasing_string += "0|1";
+                }
+            }
+            string variant_record = to_string(variant.pos+1) + "," + variant.ref + "," + alt_string;
+            vcf_record[i] += variant_record;
+            phasing_record[i] += phasing_string;
+            // NOTE(review): the separator is keyed on the last index, so a
+            // skipped final variant leaves a trailing ';' in the record.
+            if (k != final_iter) {
+                vcf_record[i] += ";";
+                phasing_record[i] += ";";
+            }
+        }
+    }
+    match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+    match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+    match_record += "\t" + to_string(best_selection.score) + "\n";
+    complex_match_records[thread_index]->push_back(match_record);
+    return true;
+// Match the reference-set (flag 0) and query-set (flag 1) variants of one
+// cluster by walking the sorted cluster in order, and on success append one
+// record to complex_match_records[thread_index].
+//
+// Each partial selection popped from the worklist is extended with the variant
+// at index cur_var + 1 of the sorted cluster, once per phasing choice
+// (0, then 1 for heterozygous variants, then -1 = variant not used), via
+// AddVariantToSelection. Selections that have already consumed the last
+// variant are dropped.
+//
+// variant_list  cluster to match; sorted in place.
+// thread_index  index into the per-thread counters and record buffers.
+// cluster_id    not used by this function.
+// Returns true when a match record was appended, false otherwise.
+bool DiploidVCF::VariantMatchPathCreation(vector<DiploidVariant> & variant_list, int thread_index, int cluster_id){
+    if(variant_list.size() <= 1) return false;
+    sort(variant_list.begin(), variant_list.end()); // here we need to sort
+    vector<DiploidVariant> separate_var_list[2]; // [0] = ref set, [1] = query set
+    int min_pos = genome_sequence.length() + 1;
+    int max_pos = -1;
+    for (size_t i = 0; i < variant_list.size(); i++) {
+        const DiploidVariant & var = variant_list[i]; // reference: avoids copying ref/alts per iteration
+        int flag = var.flag; // flag indicate if the variant is from ref set(0) or query set(1)
+        int pos = var.pos;
+        separate_var_list[flag].push_back(var);
+        min_pos = min(pos, min_pos);
+        max_pos = max((int)(pos + var.ref.length()), max_pos);
+    }
+    // pad the window by one base on each side, clamped to the genome
+    min_pos = max(min_pos - 1, 0);
+    max_pos = min(max_pos + 1, (int)genome_sequence.length()); //exclusive
+    if (separate_var_list[0].size() == 0 || separate_var_list[1].size() == 0) {
+        return false;
+    }
+    if (separate_var_list[0].size() == 1 && separate_var_list[1].size() == 1){
+        // try direct match to save time
+        if(separate_var_list[0][0] == separate_var_list[1][0]){
+            complex_ref_match_num[thread_index]++;
+            complex_que_match_num[thread_index]++;
+            const DiploidVariant & tv = separate_var_list[0][0];
+            // NOTE(review): tv.pos is written as-is here while the per-variant
+            // records below write pos+1 - confirm which base is intended.
+            string match_record = to_string(tv.pos) + "\t" + tv.ref + "\t" + tv.alts[0];
+            if(tv.multi_alts) match_record += "/" + tv.alts[1];
+            match_record += "\t.\t.\t.\t.\t.\n";
+            complex_match_records[thread_index]->push_back(match_record);
+            return true;
+        }
+        // if not match, still can match by changing genome
+    }
+    string subsequence = genome_sequence.substr(min_pos, max_pos - min_pos);
+    ToUpper(subsequence); // subsequence only contains upper char
+    int offset = min_pos;
+    list<VariantSelection> variant_selections; // sorted by last matched donor length
+    VariantSelection best_selection;
+    VariantSelection dummy;
+    variant_selections.push_back(dummy);
+    while(variant_selections.size() != 0){
+        VariantSelection current_selection = variant_selections.front();
+        variant_selections.pop_front();
+        int previous_var_index = current_selection.cur_var;
+        // every variant already evaluated for this selection: nothing to extend
+        if(previous_var_index >= (int)variant_list.size()-1) continue;
+        int cur_var_index = previous_var_index + 1;
+        DiploidVariant current_variant = variant_list[cur_var_index];
+        // update boundary of current_selection
+        current_selection.cur_var = cur_var_index;
+        // make choose decision before not choose decision, save del times
+        AddVariantToSelection(variant_selections,
+                            current_selection,
+                            current_variant,
+                            0,
+                            separate_var_list,
+                            subsequence,
+                            offset,
+                            best_selection);
+        if(current_variant.heterozygous){
+            AddVariantToSelection(variant_selections,
+                                current_selection,
+                                current_variant,
+                                1,
+                                separate_var_list,
+                                subsequence,
+                                offset,
+                                best_selection);
+        }
+        // phasing -1: continue the search without this variant
+        AddVariantToSelection(variant_selections,
+                            current_selection,
+                            current_variant,
+                            -1,
+                            separate_var_list,
+                            subsequence,
+                            offset,
+                            best_selection);
+    }
+    if (best_selection.score <= 0) return false;
+    complex_ref_match_num[thread_index] += best_selection.separate_score[0];
+    complex_que_match_num[thread_index] += best_selection.separate_score[1];
+    // Both donor alleles are always written. The former "multiple_match" flag
+    // was initialised to true and never cleared, so its test was dead code.
+    // NOTE(review): offset is written without +1, unlike the per-variant
+    // records below - confirm which base is intended.
+    string match_record = to_string(offset) + "\t" + subsequence + "\t" + best_selection.donor_sequences[0];
+    match_record += "/" + best_selection.donor_sequences[1];
+    string vcf_record[2];
+    string phasing_record[2];
+    for (int i = 0; i < 2; i++) {
+        size_t final_iter = separate_var_list[i].size()-1;
+        const vector<int> & phasing_vector = best_selection.phasing_vectors[i];
+        for (size_t k = 0; k < separate_var_list[i].size(); k++) {
+            int phasing = phasing_vector[k];
+            if(phasing == -1) continue; // variant not part of the match
+            const DiploidVariant & variant = separate_var_list[i][k];
+            string alt_string = variant.alts[0];
+            if(variant.multi_alts){
+                alt_string += "/" + variant.alts[1];
+            }
+            string phasing_string = "";
+            if(phasing == 0){
+                phasing_string += "1";
+                if(variant.heterozygous){
+                    if(variant.multi_alts){
+                        phasing_string += "|2";
+                    }else{
+                        phasing_string += "|0";
+                    }
+                }else{
+                    phasing_string += "|1";
+                }
+            }else if(phasing == 1){
+                if(variant.multi_alts){
+                    phasing_string += "2|1";
+                }else{
+                    phasing_string += "0|1";
+                }
+            }
+            string variant_record = to_string(variant.pos+1) + "," + variant.ref + "," + alt_string;
+            vcf_record[i] += variant_record;
+            phasing_record[i] += phasing_string;
+            // NOTE(review): the separator is keyed on the last index, so a
+            // skipped final variant leaves a trailing ';' in the record.
+            if (k != final_iter) {
+                vcf_record[i] += ";";
+                phasing_record[i] += ";";
+            }
+        }
+    }
+    match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+    match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+    match_record += "\t" + to_string(best_selection.score) + "\n";
+    complex_match_records[thread_index]->push_back(match_record);
+    return true;
+// Match the reference-set (flag 0) and query-set (flag 1) variants of one
+// cluster, choosing at every step which side supplies the next variant from
+// the current donor-sequence lengths, and on success append one record to
+// complex_match_records[thread_index].
+//
+// Side choice per step: the query side is used when the shorter ref donor
+// (donor_sequences[0..1]) is longer than the shorter query donor
+// (donor_sequences[2..3]) and query variants remain, or when the ref side is
+// used up; otherwise the ref side is used. The chosen variant is passed to
+// AddVariantToSelection once per phasing choice (0, then 1 for heterozygous
+// variants, then -1 = variant not used).
+//
+// variant_list  cluster to match; sorted in place.
+// thread_index  index into the per-thread counters and record buffers.
+// cluster_id    not used by this function.
+// Returns true when a match record was appended, false otherwise.
+bool DiploidVCF::VariantMatchPathCreationByDonor(vector<DiploidVariant> & variant_list, int thread_index, int cluster_id){
+    if(variant_list.size() <= 1) return false;
+    sort(variant_list.begin(), variant_list.end()); // here we need to sort
+    vector<DiploidVariant> separate_var_list[2]; // [0] = ref set, [1] = query set
+    int min_pos = genome_sequence.length() + 1;
+    int max_pos = -1;
+    for (size_t i = 0; i < variant_list.size(); i++) {
+        const DiploidVariant & var = variant_list[i]; // reference: avoids copying ref/alts per iteration
+        int flag = var.flag; // flag indicate if the variant is from ref set(0) or query set(1)
+        int pos = var.pos;
+        separate_var_list[flag].push_back(var);
+        min_pos = min(pos, min_pos);
+        max_pos = max((int)(pos + var.ref.length()), max_pos);
+    }
+    // pad the window by one base on each side, clamped to the genome
+    min_pos = max(min_pos - 1, 0);
+    max_pos = min(max_pos + 1, (int)genome_sequence.length()); //exclusive
+    if (separate_var_list[0].size() == 0 || separate_var_list[1].size() == 0) {
+        return false;
+    }
+    if (separate_var_list[0].size() == 1 && separate_var_list[1].size() == 1){
+        // try direct match to save time
+        if(separate_var_list[0][0] == separate_var_list[1][0]){
+            complex_ref_match_num[thread_index]++;
+            complex_que_match_num[thread_index]++;
+            const DiploidVariant & tv = separate_var_list[0][0];
+            // NOTE(review): tv.pos is written as-is here while the per-variant
+            // records below write pos+1 - confirm which base is intended.
+            string match_record = to_string(tv.pos) + "\t" + tv.ref + "\t" + tv.alts[0];
+            if(tv.multi_alts) match_record += "/" + tv.alts[1];
+            match_record += "\t.\t.\t.\t.\t.\n";
+            complex_match_records[thread_index]->push_back(match_record);
+            return true;
+        }
+        // if not match, still can match by changing genome
+    }else if(separate_var_list[0].size() == 1 || separate_var_list[1].size() == 1){
+        // flag = side holding the single variant, r_flag = the other side
+        int flag = 0;
+        if(separate_var_list[1].size() == 1) flag = 1;
+        int r_flag = 1-flag;
+        if(separate_var_list[r_flag].size() > 4){
+            int total_r_mdl = 0;
+            int total_r_mil = 0;
+            for(size_t k = 0; k < separate_var_list[r_flag].size(); k++){
+                const DiploidVariant & var = separate_var_list[r_flag][k];
+                total_r_mdl += var.mdl;
+                total_r_mil += var.mil;
+            }
+            // prune: the lone variant changes more than the whole other side can
+            if(max(separate_var_list[flag][0].mdl, separate_var_list[flag][0].mil) > max(total_r_mdl, total_r_mil)) return false;
+        }
+    }
+    string subsequence = genome_sequence.substr(min_pos, max_pos - min_pos);
+    ToUpper(subsequence); // subsequence only contains upper char
+    int offset = min_pos;
+    list<VariantSelection> variant_selections; // sorted by last matched donor length
+    VariantSelection best_selection;
+    // Flag the cluster when, within one side, a variant starts before the
+    // furthest end (pos + ref length) seen so far and extends beyond it.
+    bool overlap_detected = false;
+    for(int i = 0; i < 2; i++){
+        int largest_pos = 0;
+        for(size_t k = 0; k < separate_var_list[i].size(); k++){
+            const DiploidVariant & var = separate_var_list[i][k];
+            if(var.pos < largest_pos && var.pos+var.ref.length() > largest_pos){
+                overlap_detected = true;
+                break;
+            }
+            largest_pos = max(largest_pos, (int)(var.pos+var.ref.length()));
+        }
+        if(overlap_detected) break;
+    }
+    VariantSelection dummy;
+    dummy.overlap_detected = overlap_detected;
+    variant_selections.push_back(dummy);
+    while(variant_selections.size() != 0){
+        VariantSelection current_selection = variant_selections.front();
+        variant_selections.pop_front();
+        int previous_var_index = current_selection.cur_var;
+        // every variant already evaluated for this selection: nothing to extend
+        if(previous_var_index >= (int)variant_list.size()-1) continue;
+        bool choose_ref = true;
+        int min_ref_donor = min(current_selection.donor_sequences[0].length(), current_selection.donor_sequences[1].length());
+        int min_que_donor = min(current_selection.donor_sequences[2].length(), current_selection.donor_sequences[3].length());
+        if(min_ref_donor > min_que_donor && current_selection.phasing_vectors[1].size() < separate_var_list[1].size()){
+            choose_ref = false;
+        }
+        if(current_selection.phasing_vectors[0].size() >= separate_var_list[0].size()){
+            choose_ref = false;
+        }
+        int side = choose_ref ? 0 : 1;
+        DiploidVariant current_variant = separate_var_list[side][current_selection.phasing_vectors[side].size()];
+        current_selection.cur_var++;
+        // make choose decision before not choose decision, save del times
+        AddVariantToSelection(variant_selections,
+                            current_selection,
+                            current_variant,
+                            0,
+                            separate_var_list,
+                            subsequence,
+                            offset,
+                            best_selection);
+        if(current_variant.heterozygous){
+            AddVariantToSelection(variant_selections,
+                                current_selection,
+                                current_variant,
+                                1,
+                                separate_var_list,
+                                subsequence,
+                                offset,
+                                best_selection);
+        }
+        // phasing -1: continue the search without this variant
+        AddVariantToSelection(variant_selections,
+                            current_selection,
+                            current_variant,
+                            -1,
+                            separate_var_list,
+                            subsequence,
+                            offset,
+                            best_selection);
+    }
+    if (best_selection.score <= 0) return false;
+    complex_ref_match_num[thread_index] += best_selection.separate_score[0];
+    complex_que_match_num[thread_index] += best_selection.separate_score[1];
+    // Both donor alleles are always written. The former "multiple_match" flag
+    // was initialised to true and never cleared, so its test was dead code.
+    // NOTE(review): offset is written without +1, unlike the per-variant
+    // records below - confirm which base is intended.
+    string match_record = to_string(offset) + "\t" + subsequence + "\t" + best_selection.donor_sequences[0];
+    match_record += "/" + best_selection.donor_sequences[1];
+    string vcf_record[2];
+    string phasing_record[2];
+    for (int i = 0; i < 2; i++) {
+        size_t final_iter = separate_var_list[i].size()-1;
+        const vector<int> & phasing_vector = best_selection.phasing_vectors[i];
+        for (size_t k = 0; k < separate_var_list[i].size(); k++) {
+            int phasing = phasing_vector[k];
+            if(phasing == -1) continue; // variant not part of the match
+            const DiploidVariant & variant = separate_var_list[i][k];
+            string alt_string = variant.alts[0];
+            if(variant.multi_alts){
+                alt_string += "/" + variant.alts[1];
+            }
+            string phasing_string = "";
+            if(phasing == 0){
+                phasing_string += "1";
+                if(variant.heterozygous){
+                    if(variant.multi_alts){
+                        phasing_string += "|2";
+                    }else{
+                        phasing_string += "|0";
+                    }
+                }else{
+                    phasing_string += "|1";
+                }
+            }else if(phasing == 1){
+                if(variant.multi_alts){
+                    phasing_string += "2|1";
+                }else{
+                    phasing_string += "0|1";
+                }
+            }
+            string variant_record = to_string(variant.pos+1) + "," + variant.ref + "," + alt_string;
+            vcf_record[i] += variant_record;
+            phasing_record[i] += phasing_string;
+            // NOTE(review): the separator is keyed on the last index, so a
+            // skipped final variant leaves a trailing ';' in the record.
+            if (k != final_iter) {
+                vcf_record[i] += ";";
+                phasing_record[i] += ";";
+            }
+        }
+    }
+    match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+    match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+    match_record += "\t" + to_string(best_selection.score) + "\n";
+    complex_match_records[thread_index]->push_back(match_record);
+    return true;
+// code reviewed by Chen on 4/4/2016
+//
+// Match the reference-set (flag 0) and query-set (flag 1) variants of one
+// cluster through FindBestMatchWithOverlap and add the number of selected
+// positions on each side to the per-thread match counters.
+//
+// variant_list  cluster to match; sorted in place.
+// thread_index  index into complex_ref_match_num / complex_que_match_num.
+// Returns true when at least one position was selected on both sides,
+// false otherwise (including clusters with fewer than two variants or with
+// one side empty).
+bool DiploidVCF::VariantMatchWithOverlap(vector<DiploidVariant> & variant_list, int thread_index) {
+    if(variant_list.size() <= 1) return false;
+    sort(variant_list.begin(), variant_list.end());
+    // separate into ref and que, keyed by position
+    // NOTE(review): two variants of the same set at the same pos overwrite
+    // each other in this map - confirm that is acceptable.
+    map<int, DiploidVariant> separate_pos_var[2];
+    int min_pos = genome_sequence.length() + 1;
+    int max_pos = -1;
+    for (size_t i = 0; i < variant_list.size(); i++) {
+        const DiploidVariant & var = variant_list[i]; // reference: avoids copying ref/alts per iteration
+        int flag = var.flag; // flag indicate if the variant is from ref set(0) or query set(1)
+        int pos = var.pos;
+        separate_pos_var[flag][pos] = var;
+        min_pos = min(pos, min_pos);
+        max_pos = max((int)(pos + var.ref.length()), max_pos);
+    }
+    // pad the window by one base on each side, clamped to the genome
+    min_pos = max(min_pos - 1, 0);
+    max_pos = min(max_pos + 1, (int)genome_sequence.length());
+    if (separate_pos_var[0].size() == 0 || separate_pos_var[1].size() == 0) {
+        return false;
+    }
+    string subsequence = genome_sequence.substr(min_pos, max_pos - min_pos);
+    int offset = min_pos;
+    map<int, int> selected_positions[2]; // filled by FindBestMatchWithOverlap
+    // The bool result is not consulted; success is judged from
+    // selected_positions below.
+    FindBestMatchWithOverlap(variant_list,
+                  subsequence,
+                  offset,
+                  0,
+                  separate_pos_var,
+                  selected_positions);
+    if (selected_positions[0].size() == 0 || selected_positions[1].size() == 0) {
+        return false;
+    }
+    complex_ref_match_num[thread_index] += selected_positions[0].size();
+    complex_que_match_num[thread_index] += selected_positions[1].size();
+    return true;
+bool DiploidVCF::FindBestMatchWithOverlap(vector<DiploidVariant> & variant_list,
+	const string subsequence,
+	const int offset,
+	int index,
+	map<int, DiploidVariant> separate_pos_var[],
+	map<int, int> selected_positions[])
+    //set<int> selected_positions[2];
+	vector<int> positions[2]; // 0 from ref, 1 from query
+	vector<bool> indicators[2]; // 0 from ref, 1 from query, indicate if multi_alts(true) or not(false)
+	for (int i = 0; i < 2; i++) {
+		for (auto it = separate_pos_var[i].begin(); it != separate_pos_var[i].end(); ++it) {
+			auto v = it->second;
+			positions[i].push_back(v.pos);
+			indicators[i].push_back(v.multi_alts);
+		}
+	}
+	// construct ref combinations in hash table, key is donor sequence
+    unordered_map<string, vector<vector<int>> > seq_choice_ref;
+    unordered_map<string, int> seq_score_ref; // corresponding score, if same key, store the one with highest score
+	for (int i = 1; i <= positions[0].size(); i++) { // i : how many variants are chosen
+		vector<vector<vector<int>>> ref_choice_list = Combine(positions[0], indicators[0], i);
+		for (auto rit = ref_choice_list.begin(); rit != ref_choice_list.end(); ++rit) { // iterate all combinations with i variants
+            // each combination is a vector of pairs(position, alt_index), alt_index is 0 or 1 (if multi_alts)
+            string donor;
+            int score;
+            ModifyRefMultiVar(subsequence, offset, separate_pos_var[0], *rit, donor, score);
+            if(CompareSequence(donor, subsequence)) continue;
+            if(seq_choice_ref.find(donor) != seq_choice_ref.end() && seq_score_ref[donor] > score){
+                continue;
+            }else{
+                // either overwrite or insert new
+                seq_choice_ref[donor] = *rit;
+                seq_score_ref[donor] = score;
+            }
+            //dout << "ref-donor: " << donor << endl;
+		}
+	}
+	// now all combinations are stored in hash table seq_choice_ref
+	// search query
+    for(int i = 1; i <= positions[1].size(); i++){
+            // iterate all combinations with i variants
+        vector<vector<vector<int>>> que_choice_list = Combine(positions[1], indicators[1], i);
+        for (auto qit = que_choice_list.begin(); qit != que_choice_list.end(); ++qit){
+            string donor;
+            int score;
+            ModifyRefMultiVar(subsequence, offset, separate_pos_var[1], *qit, donor, score);
+            if(CompareSequence(donor, subsequence)) continue;
+            if(seq_choice_ref.find(donor) != seq_choice_ref.end()){
+                // first check if there is heterozygous alleles
+                int total_score = seq_score_ref[donor] + score;
+                if (total_score <= 0) continue;
+                // this time we don't find max, but all, and put them in a set
+                //if(total_score <= max_score) continue;
+                bool local_heter = false;
+                bool local_multi = false;
+                vector<vector<int>> ref_var_choices = seq_choice_ref[donor];
+                vector<vector<int>> que_var_choices = *qit;
+                if(! match_genotype){
+                    for(int k = 0; k < ref_var_choices.size(); k++){
+                        if(selected_positions[0].find(ref_var_choices[k][0]) == selected_positions[0].end()){
+                            selected_positions[0][ref_var_choices[k][0]] = ref_var_choices[k][1];
+                        }
+                    }
+                    for(int k = 0; k < que_var_choices.size(); k++){
+                        if(selected_positions[1].find(que_var_choices[k][0]) == selected_positions[1].end()){
+                            selected_positions[1][que_var_choices[k][0]] = que_var_choices[k][1];
+                        }
+                    }
+                    continue;
+                }
+                vector<vector<int>> ref_other_choices;
+                vector<vector<int>> que_other_choices;
+                // check and construct heterozygous alleles
+                for(int ri = 0; ri < ref_var_choices.size(); ri++){
+                    int ref_pos = ref_var_choices[ri][0];
+                    DiploidVariant ref_variant = separate_pos_var[0][ref_pos];
+                    if (ref_variant.multi_alts){
+                        local_multi = true;
+                        ref_other_choices.push_back(vector<int>({ref_pos, 1 - ref_var_choices[ri][1]}));
+                    }else if(ref_variant.heterozygous){
+                        local_heter = true;
+                        ref_other_choices.push_back(vector<int>({ref_pos,-1}));
+                    }else{
+                        ref_other_choices.push_back(vector<int>({ref_pos, ref_var_choices[ri][1]}));
+                    }
+                }
+                // if not find heter, continue checking
+                for(int qi = 0; qi < que_var_choices.size(); qi++){
+                    int que_pos = que_var_choices[qi][0];
+                    DiploidVariant que_variant = separate_pos_var[1][que_pos];
+                    if(que_variant.multi_alts){
+                        local_multi = true;
+                        que_other_choices.push_back(vector<int>({que_pos, 1- que_var_choices[qi][1]}));
+                    }else if (que_variant.heterozygous){
+                        local_heter = true;
+                        que_other_choices.push_back(vector<int>({que_pos, -1}));
+                    }else{
+                        que_other_choices.push_back(vector<int>({que_pos, que_var_choices[qi][1]}));
+                    }
+                }
+                if(local_multi){
+                    // also check the other chromosome matches
+                    int temp_score;
+                    string ref_other_donor;
+                    ModifyRefMultiVar(subsequence, offset, separate_pos_var[0], ref_other_choices, ref_other_donor, temp_score);
+                    string que_other_donor;
+                    ModifyRefMultiVar(subsequence, offset, separate_pos_var[1], que_other_choices, que_other_donor, temp_score);
+                    if(CompareSequence(ref_other_donor, que_other_donor)){
+                        for(int k = 0; k < ref_var_choices.size(); k++){
+                            if(selected_positions[0].find(ref_var_choices[k][0]) == selected_positions[0].end()){
+                                selected_positions[0][ref_var_choices[k][0]] = ref_var_choices[k][1];
+                            }
+                        }
+                        for(int k = 0; k < que_var_choices.size(); k++){
+                            if(selected_positions[1].find(que_var_choices[k][0]) == selected_positions[1].end()){
+                                selected_positions[1][que_var_choices[k][0]] = que_var_choices[k][1];
+                            }
+                        }
+                    }
+                }else if(local_heter){
+                    // also check the other chromosome matches
+                    int temp_score;
+                    string ref_other_donor;
+                    ModifyRefMultiVar(subsequence, offset, separate_pos_var[0], ref_other_choices, ref_other_donor, temp_score);
+                    string que_other_donor;
+                    ModifyRefMultiVar(subsequence, offset, separate_pos_var[1], que_other_choices, que_other_donor, temp_score);
+                    if(CompareSequence(ref_other_donor, que_other_donor)){
+                        for(int k = 0; k < ref_var_choices.size(); k++){
+                            if(selected_positions[0].find(ref_var_choices[k][0]) == selected_positions[0].end()){
+                                selected_positions[0][ref_var_choices[k][0]] = ref_var_choices[k][1];
+                            }
+                        }
+                        for(int k = 0; k < que_var_choices.size(); k++){
+                            if(selected_positions[1].find(que_var_choices[k][0]) == selected_positions[1].end()){
+                                selected_positions[1][que_var_choices[k][0]] = que_var_choices[k][1];
+                            }
+                        }
+                    }
+                }else{
+                    for(int k = 0; k < ref_var_choices.size(); k++){
+                        if(selected_positions[0].find(ref_var_choices[k][0]) == selected_positions[0].end()){
+                            selected_positions[0][ref_var_choices[k][0]] = ref_var_choices[k][1];
+                        }
+                    }
+                    for(int k = 0; k < que_var_choices.size(); k++){
+                        if(selected_positions[1].find(que_var_choices[k][0]) == selected_positions[1].end()){
+                            selected_positions[1][que_var_choices[k][0]] = que_var_choices[k][1];
+                        }
+                    }
+                    //delay construct optimal solution at the very end.
+                }
+            }
+        }
+    }
+    if(selected_positions[0].size() > 0 && selected_positions[1].size() > 0){
+        vector<vector<int>> ref_set_choices;
+        vector<vector<int>> que_set_choices;
+        for(auto it = selected_positions[0].begin(); it != selected_positions[0].end(); ++it){
+            ref_set_choices.push_back(vector<int>({it->first, it->second}));
+        }
+        for(auto it = selected_positions[1].begin(); it != selected_positions[1].end(); ++it){
+            que_set_choices.push_back(vector<int>({it->first, it->second}));
+        }
+        int temp_score;
+        string ref_set_donor;
+        ModifyRefMultiVar(subsequence, offset, separate_pos_var[0], ref_set_choices, ref_set_donor, temp_score);
+        string que_set_donor;
+        ModifyRefMultiVar(subsequence, offset, separate_pos_var[1], que_set_choices, que_set_donor, temp_score);
+        if(!CompareSequence(ref_set_donor, que_set_donor)){
+            cout << "Overlap matching does not agree with non-overlap one";
+        }
+    }
+    return true;
+/* code reviewed by Chen on 4/3/2016
+ *
+ * Enumerate variant combinations of the reference set (separate_pos_var[0])
+ * and of the query set (separate_pos_var[1]), apply each combination to
+ * `subsequence`, and keep the highest-scoring ref/query pair whose modified
+ * (donor) sequences are identical.
+ *
+ * Outputs, written only when a strictly better match is accepted:
+ *   max_choices[0], max_choices[2] : matched ref and query choices
+ *   max_choices[1], max_choices[3] : complementary ref and query choices,
+ *                                    written only in the multi-alt branch
+ *   max_paths[0]                   : donor sequence of the match
+ *   max_paths[1]                   : donor of the complement (multi-alt branch only)
+ *   max_score                      : ref score + query score of the match
+ *   max_heterozygosity             : true only in the multi-alt branch
+ *
+ * max_score is also an input: candidates must beat the value passed in.
+ * Returns true when max_score > 0 on exit, false otherwise.
+ * variant_list and index are not read by this function.
+ */
+bool DiploidVCF::FindBestMatch(vector<DiploidVariant> & variant_list,
+	const string subsequence,
+	const int offset,
+	int index,
+	map<int, DiploidVariant> separate_pos_var[],
+	vector<vector<int>> max_choices[],  // 4 vectors
+	int & max_score,
+	bool & max_heterozygosity,
+	string max_paths[])
+    set<int> selected_positions[2]; // positions seen in any positive-score match; filled only when !match_genotype
+	vector<int> positions[2]; // 0 from ref, 1 from query
+	vector<bool> indicators[2]; // 0 from ref, 1 from query, indicate if multi_alts(true) or not(false)
+	for (int i = 0; i < 2; i++) {
+		for (auto it = separate_pos_var[i].begin(); it != separate_pos_var[i].end(); ++it) {
+			auto v = it->second;
+			positions[i].push_back(v.pos);
+			indicators[i].push_back(v.multi_alts);
+		}
+	}
+	// construct ref combinations in hash table, key is donor sequence
+    unordered_map<string, vector<vector<int>> > seq_choice_ref;
+    unordered_map<string, int> seq_score_ref; // corresponding score, if same key, store the one with highest score
+	for (int i = 1; i <= positions[0].size(); i++) { // i : how many variants are chosen
+		vector<vector<vector<int>>> ref_choice_list = Combine(positions[0], indicators[0], i);
+		for (auto rit = ref_choice_list.begin(); rit != ref_choice_list.end(); ++rit) { // iterate all combinations with i variants
+            // each combination is a vector of pairs(position, alt_index), alt_index is 0 or 1 (if multi_alts)
+            string donor;
+            int score;
+            ModifyRefMultiVar(subsequence, offset, separate_pos_var[0], *rit, donor, score);
+            if(CompareSequence(donor, subsequence)) continue; // presumably: donor equals the unmodified sequence, nothing to match -- confirm CompareSequence semantics
+            if(seq_choice_ref.find(donor) != seq_choice_ref.end() && seq_score_ref[donor] > score){ // stored entry wins only with a strictly higher score
+                continue;
+            }else{
+                // either overwrite or insert new
+                seq_choice_ref[donor] = *rit;
+                seq_score_ref[donor] = score;
+            }
+            //dout << "ref-donor: " << donor << endl;
+		}
+	}
+	// now all combinations are stored in hash table seq_choice_ref
+	// search query
+    for(int i = 1; i <= positions[1].size(); i++){
+            // iterate all combinations with i variants
+        vector<vector<vector<int>>> que_choice_list = Combine(positions[1], indicators[1], i);
+        for (auto qit = que_choice_list.begin(); qit != que_choice_list.end(); ++qit){
+            string donor;
+            int score;
+            ModifyRefMultiVar(subsequence, offset, separate_pos_var[1], *qit, donor, score);
+            if(CompareSequence(donor, subsequence)) continue;
+            if(seq_choice_ref.find(donor) != seq_choice_ref.end()){ // a ref combination produced the same donor
+                // first check if there is heterozygous alleles
+                int total_score = seq_score_ref[donor] + score;
+                vector<vector<int>> ref_var_choices = seq_choice_ref[donor];
+                vector<vector<int>> que_var_choices = *qit;
+                // for debug====================
+                if(total_score <= 0) continue;
+                if(! match_genotype){
+                    for(int k = 0; k < ref_var_choices.size(); k++){
+                        selected_positions[0].insert(ref_var_choices[k][0]);
+                    }
+                    for(int k = 0; k < que_var_choices.size(); k++){
+                        selected_positions[1].insert(que_var_choices[k][0]);
+                    }
+                }
+                // for debug====================
+                if(total_score <= max_score) continue; // only strictly better matches replace the current best
+                bool local_heter = false;
+                bool local_multi = false;
+                if(! match_genotype){ // without genotype matching the first haplotype alone decides
+                    max_choices[0] = ref_var_choices;
+                    max_choices[2] = que_var_choices;
+                    max_paths[0] = donor;
+                    max_score = total_score;
+                    max_heterozygosity = false;
+                    continue;
+                }
+                vector<vector<int>> ref_other_choices;
+                vector<vector<int>> que_other_choices;
+                // check and construct heterozygous alleles
+                for(int ri = 0; ri < ref_var_choices.size(); ri++){
+                    int ref_pos = ref_var_choices[ri][0];
+                    DiploidVariant ref_variant = separate_pos_var[0][ref_pos];
+                    if (ref_variant.multi_alts){
+                        local_multi = true;
+                        ref_other_choices.push_back(vector<int>({ref_pos, 1 - ref_var_choices[ri][1]})); // the other of the two alts
+                    }else if(ref_variant.heterozygous){
+                        local_heter = true;
+                        ref_other_choices.push_back(vector<int>({ref_pos,-1})); // -1 makes ModifyRefMultiVar use the reference allele
+                    }else{
+                        ref_other_choices.push_back(vector<int>({ref_pos, ref_var_choices[ri][1]})); // neither flag set: same allele on the other haplotype
+                    }
+                }
+                // if not find heter, continue checking
+                for(int qi = 0; qi < que_var_choices.size(); qi++){
+                    int que_pos = que_var_choices[qi][0];
+                    DiploidVariant que_variant = separate_pos_var[1][que_pos];
+                    if(que_variant.multi_alts){
+                        local_multi = true;
+                        que_other_choices.push_back(vector<int>({que_pos, 1- que_var_choices[qi][1]}));
+                    }else if (que_variant.heterozygous){
+                        local_heter = true;
+                        que_other_choices.push_back(vector<int>({que_pos, -1}));
+                    }else{
+                        que_other_choices.push_back(vector<int>({que_pos, que_var_choices[qi][1]}));
+                    }
+                }
+                if(local_multi){
+                    // also check the other chromosome matches
+                    int temp_score;
+                    string ref_other_donor;
+                    ModifyRefMultiVar(subsequence, offset, separate_pos_var[0], ref_other_choices, ref_other_donor, temp_score);
+                    string que_other_donor;
+                    ModifyRefMultiVar(subsequence, offset, separate_pos_var[1], que_other_choices, que_other_donor, temp_score);
+                    if(CompareSequence(ref_other_donor, que_other_donor)){ // accepted only if the complementary haplotypes agree too
+                        max_choices[0] = ref_var_choices;
+                        max_choices[1] = ref_other_choices;
+                        max_choices[2] = que_var_choices;
+                        max_choices[3] = que_other_choices;
+                        max_paths[0] = donor;
+                        max_paths[1] = ref_other_donor;
+                        max_score = total_score;
+                        max_heterozygosity = true;
+                    }
+                }else if(local_heter){
+                    // also check the other chromosome matches
+                    int temp_score;
+                    string ref_other_donor;
+                    ModifyRefMultiVar(subsequence, offset, separate_pos_var[0], ref_other_choices, ref_other_donor, temp_score);
+                    string que_other_donor;
+                    ModifyRefMultiVar(subsequence, offset, separate_pos_var[1], que_other_choices, que_other_donor, temp_score);
+                    if(CompareSequence(ref_other_donor, que_other_donor)){
+                        max_choices[0] = ref_var_choices; // note: max_choices[1], max_choices[3] and max_paths[1] are left as they were
+                        max_choices[2] = que_var_choices;
+                        max_paths[0] = donor;
+                        max_score = total_score;
+                        max_heterozygosity = false;
+                    }
+                }else{
+                    max_choices[0] = ref_var_choices;
+                    max_choices[2] = que_var_choices;
+                    max_paths[0] = donor;
+                    max_score = total_score;
+                    max_heterozygosity = false;
+                    //delay construct optimal solution at the very end.
+                }
+            }
+        }
+    }
+    if(max_score > 0){
+        if(max_choices[0].size() < selected_positions[0].size() || max_choices[2].size() < selected_positions[1].size()){ // diagnostic only: the body is a disabled debug print
+            //dout << "overlap match differs!" << endl;
+        }
+        return true;
+    }
+    return false;
+// Collect every selection of exactly k entries of `positions` produced by
+// FindDiploidComb. Each selection is a list of {position, allele index} pairs.
+// An empty result is returned when k is 0 or exceeds positions.size(); the
+// comparison is done in unsigned arithmetic, so a negative k is rejected too.
+vector<vector<vector<int>>> DiploidVCF::DiploidCombine(vector<int> & positions,
+                                                       vector<bool> & heter_indicators,
+                                                       vector<bool> & multi_indicators,
+                                                       int k) {
+    vector<vector<vector<int>>> combinations;
+    const bool usable_k = (k != 0) && !(k > positions.size());
+    if (usable_k) {
+        vector<vector<int>> partial; // selection under construction
+        FindDiploidComb(positions, heter_indicators, multi_indicators, 0, k, partial, combinations);
+    }
+    return combinations;
+}
+// Recursively extend `sol` with k more entries taken from positions[start..]
+// and append every completed selection to `all_sol`.
+//
+// Each entry is {position, allele}: allele 0 is always tried; when
+// heter_indicators[i] is set a second allele is tried as well, which is 1 if
+// multi_indicators[i] is set and -1 otherwise (-1 makes ModifyRefMultiVar
+// use the reference allele).
+//
+// Fix: the recursive step used to call FindComb(positions, multi_indicators, ...),
+// which dropped heter_indicators, so only the first chosen variant ever got
+// its second (reference) allele. Recursing into FindDiploidComb applies the
+// same rule at every depth.
+void DiploidVCF::FindDiploidComb(vector<int> & positions,
+    vector<bool> & heter_indicators,
+	vector<bool> & multi_indicators,
+	int start,
+	int k,
+	vector<vector<int> > & sol,
+	vector<vector<vector<int>>> & all_sol)
+{
+	if (k == 0) {
+		all_sol.push_back(sol);
+		return;
+	}
+	const int n = (int)positions.size();
+	// stop at n - k so that enough positions remain to complete the selection
+	for (int i = start; i <= n - k; i++) {
+		sol.push_back(vector<int>({ positions[i], 0 }));
+		FindDiploidComb(positions, heter_indicators, multi_indicators, i + 1, k - 1, sol, all_sol);
+		sol.pop_back();
+		if (heter_indicators[i]) { // try second allele
+			const int second_allele = multi_indicators[i] ? 1 : -1;
+			sol.push_back(vector<int>({ positions[i], second_allele }));
+			FindDiploidComb(positions, heter_indicators, multi_indicators, i + 1, k - 1, sol, all_sol);
+			sol.pop_back();
+		}
+	}
+}
+bool DiploidVCF::FindBestDiploidMatch(vector<DiploidVariant> & variant_list,
+	const string subsequence,
+	const int offset,
+	int index,
+	map<int, DiploidVariant> separate_pos_var[],
+	vector<vector<int>> max_choices[],  // 4 vectors
+	int & max_score,
+	bool & max_heterozygosity,
+	string max_paths[]){ /* Best-match search over haplotype pairs.
+	 *
+	 * For every combination of reference variants (separate_pos_var[0]) a
+	 * complementary combination is derived, both are applied to `subsequence`,
+	 * and the two donor strings are joined in sorted order with "," to form a
+	 * key. Query combinations (separate_pos_var[1]) are keyed the same way;
+	 * when a query key exists in the reference table and the summed score
+	 * beats max_score, the outputs are overwritten:
+	 *   max_choices[0], max_choices[1] : ref choice and its complement
+	 *   max_choices[2], max_choices[3] : query choice and its complement
+	 *   max_paths[0], max_paths[1]     : the two query donor sequences
+	 *   max_heterozygosity             : multi_chr of the query combination
+	 * Scores use only the first haplotype (one_score); another_score is
+	 * computed but never read.
+	 * The function always returns true; variant_list and index are not read.
+	 */
+	vector<int> positions[2]; // 0 from ref, 1 from query
+	vector<bool> heter_indicators[2]; // 0 from ref, 1 from query, indicate if heterozygous(true) or not(false)
+	vector<bool> multi_indicators[2]; // indicate if contains multi alt, if heter but not multi, then the other choice is ref(-1)
+	for (int i = 0; i < 2; i++) {
+		for (auto it = separate_pos_var[i].begin(); it != separate_pos_var[i].end(); ++it) {
+			DiploidVariant v = it->second;
+			positions[i].push_back(v.pos);
+			heter_indicators[i].push_back(v.heterozygous);
+			multi_indicators[i].push_back(v.multi_alts);
+		}
+	}
+    map<string, vector<vector<int>> > seq_choice1_ref; // key -> ref choice
+    map<string, vector<vector<int>> > seq_choice2_ref; // key -> complementary ref choice
+    map<string, int> seq_score_ref; // corresponding score, if same key, store the one with highest score
+	for (int i = 1; i <= positions[0].size(); i++) { // i : how many variants are chosen
+		vector<vector<vector<int>>> ref_choice_list = DiploidCombine(positions[0], heter_indicators[0], multi_indicators[0], i);
+		for (auto rit = ref_choice_list.begin(); rit != ref_choice_list.end(); ++rit) { // iterate all combinations with i variants
+            // each combination is a vector of pairs(position, alt_index), alt_index is 0 or 1 (if multi_alts)
+            vector<vector<int>> one_choice = *rit;
+            vector<vector<int>> another_choice;
+            // generate another choice;
+            bool multi_chr = false; // set when the two haplotypes can differ
+            for(int ri = 0; ri < one_choice.size(); ri++){
+                int ref_pos = one_choice[ri][0];
+                DiploidVariant ref_variant = separate_pos_var[0][ref_pos];
+                if (ref_variant.multi_alts){
+                    multi_chr = true;
+                    another_choice.push_back(vector<int>({ref_pos, 1 - one_choice[ri][1]})); // the other of the two alts
+                }else if(ref_variant.heterozygous){
+                    multi_chr = true;
+                    int another_allele = -1; // -1 makes ModifyRefMultiVar use the reference allele
+                    if(one_choice[ri][1] == -1) another_allele = 0;
+                    another_choice.push_back(vector<int>({ref_pos,another_allele}));
+                }else{
+                    another_choice.push_back(vector<int>({ref_pos, one_choice[ri][1]}));
+                }
+            }
+            string one_donor;
+            string another_donor;
+            int one_score;
+            int another_score; // written by ModifyRefMultiVar below but never read
+            ModifyRefMultiVar(subsequence, offset, separate_pos_var[0], one_choice, one_donor, one_score);
+            if(multi_chr){
+                ModifyRefMultiVar(subsequence, offset, separate_pos_var[0], another_choice, another_donor, another_score);
+            }else{
+                another_donor = one_donor;
+            }
+            string donor;
+            if(one_donor < another_donor){ // sort the pair so the key does not depend on haplotype order
+                donor = one_donor + "," + another_donor;
+            }else{
+                donor = another_donor + "," + one_donor;
+            }
+            // key will be donor string
+            if(CompareSequence(donor, subsequence+","+subsequence)) continue; // presumably: both haplotypes equal the unmodified sequence -- confirm CompareSequence semantics
+            int score = one_score;
+            if(seq_choice1_ref.find(donor) != seq_choice1_ref.end() && seq_score_ref[donor] > score){
+                continue;
+            }else{
+                // either overwrite or insert new
+                seq_choice1_ref[donor] = one_choice;
+                seq_choice2_ref[donor] = another_choice;
+                seq_score_ref[donor] = score;
+            }
+            //dout << "ref-donor: " << donor << endl;
+		}
+	}
+	// by now generate all combinations of ref variant set, with sorted donor sequences as key
+    for(int i = 1; i <= positions[1].size(); i++){
+            // iterate all combinations with i variants
+        vector<vector<vector<int>>> que_choice_list = DiploidCombine(positions[1], heter_indicators[1], multi_indicators[1], i);
+        for (auto qit = que_choice_list.begin(); qit != que_choice_list.end(); ++qit){
+            vector<vector<int>> one_choice = *qit;
+            vector<vector<int>> another_choice;
+            bool multi_chr = false;
+            for(int qi = 0; qi < one_choice.size(); qi++){
+                int que_pos = one_choice[qi][0];
+                DiploidVariant que_variant = separate_pos_var[1][que_pos];
+                if(que_variant.multi_alts){
+                    multi_chr = true;
+                    another_choice.push_back(vector<int>({que_pos, 1- one_choice[qi][1]}));
+                }else if (que_variant.heterozygous){
+                    multi_chr = true;
+                    int another_allele = -1;
+                    if(one_choice[qi][1] == -1) another_allele = 0;
+                    another_choice.push_back(vector<int>({que_pos, another_allele}));
+                }else{
+                    another_choice.push_back(vector<int>({que_pos, one_choice[qi][1]}));
+                }
+            }
+            string one_donor;
+            string another_donor;
+            int one_score;
+            int another_score; // written by ModifyRefMultiVar below but never read
+            ModifyRefMultiVar(subsequence, offset, separate_pos_var[1], one_choice, one_donor, one_score);
+            if(multi_chr){
+                ModifyRefMultiVar(subsequence, offset, separate_pos_var[1], another_choice, another_donor, another_score);
+            }else{
+                another_donor = one_donor;
+            }
+            string donor;
+            if(one_donor < another_donor){
+                donor = one_donor + "," + another_donor;
+            }else{
+                donor = another_donor + "," + one_donor;
+            }
+            if(seq_choice1_ref.find(donor) != seq_choice1_ref.end()){ // same haplotype pair reachable from the ref set
+                int total_score = seq_score_ref[donor] + one_score;
+                if (total_score > max_score){ // strictly better only
+                    max_score = total_score;
+                    max_paths[0] = one_donor;
+                    max_paths[1] = another_donor;
+                    max_heterozygosity = multi_chr;
+                    max_choices[0] = seq_choice1_ref[donor];
+                    max_choices[1] = seq_choice2_ref[donor];
+                    max_choices[2] = one_choice;
+                    max_choices[3] = another_choice;
+                }
+            }
+        }
+    }
+    return true;
+//[todo] support variant match without haplotype
+// code reviewed by Channing on 4/3/2016
+//
+// Apply a set of allele choices to a copy of `genome`.
+//
+//   genome     : sequence to modify; it is copied into `donor` and upper-cased
+//   offset     : subtracted from each variant position to index into `genome`
+//   pos_var    : position -> variant lookup (operator[] is used, so a missing
+//                position inserts a default-constructed entry)
+//   pos_choice : {position, alt_index} pairs; alt_index >= 0 selects
+//                variant.alts[alt_index], a negative alt_index selects
+//                variant.ref. Taken by value because it is sorted here.
+//   donor      : output, the modified upper-case sequence
+//   score      : output, sum of ref lengths when scoring_basepair is set,
+//                otherwise the number of applied choices
+//
+// On any failure the function returns early with score == 0; `donor` may
+// then hold a partially modified sequence.
+//
+// Changes from the previous version: a negative offset_pos and an alt_index
+// beyond variant.alts are now rejected instead of reaching substr() /
+// operator[] out of range; the unused start_pos local is gone; the variant
+// is bound by reference instead of copied on every iteration.
+void DiploidVCF::ModifyRefMultiVar(const string & genome,
+                                   int offset,
+                                   map<int, DiploidVariant> & pos_var,
+                                   vector<vector<int>> pos_choice,
+                                   string & donor,
+                                   int & score) {
+    donor = genome;
+    score = 0; // if return before end of function, score = 0
+    int local_score = 0;
+    transform(donor.begin(), donor.end(), donor.begin(), ::toupper);
+    // edit from the highest position down so that a length change never
+    // shifts the offsets of the edits still to be applied
+    std::sort(pos_choice.begin(), pos_choice.end(),
+          [](const std::vector<int>& a, const std::vector<int>& b) {
+            return a[0]>b[0];}); // sorted by position in reverse order
+    for(size_t i = 0; i < pos_choice.size(); i++){
+        assert(pos_choice[i].size() == 2);
+        const int pos = pos_choice[i][0];
+        const int alt_index = pos_choice[i][1];
+        const int offset_pos = pos - offset;
+        const DiploidVariant & variant = pos_var[pos];
+        if(alt_index > 0 && !variant.multi_alts){
+            dout << "[VarMatch] Warning: modify reference genome with allele not exist" << endl;
+            return;
+        }
+        if(alt_index >= (int) variant.alts.size()){
+            // requested allele is not stored for this variant
+            return;
+        }
+        const int offset_end = offset_pos + (int) variant.ref.length();
+        const string & alt = (alt_index >= 0) ? variant.alts[alt_index] : variant.ref;
+        const int donor_length = (int) donor.length();
+        if(offset_pos < 0 || offset_pos > donor_length || offset_end > donor_length){
+            //dout << "[VarMatch] Warning: overlapping variants detected." << endl; // the most reason is overlapping variants
+            return;
+        }
+        donor = donor.substr(0, offset_pos) + alt + donor.substr(offset_end, donor_length - offset_end);
+        if(scoring_basepair){
+            local_score += variant.ref.size();
+        }else{
+            local_score++;
+        }
+    }
+    // alleles may be lower-case, so normalise again after editing
+    transform(donor.begin(), donor.end(), donor.begin(), ::toupper);
+    //only assign score here, if fail to change reference, score will be 0
+    score = local_score;
+    return;
+}
+int DiploidVCF::test() { // Placeholder self-test: the fixture below is commented out, so this only returns 0.
+//	genome_sequence = "GTCAGCCGG";
+//	DiploidVariant d1(1, "T", vector<string> ({"A", "C"}), true, true, 0);
+//	DiploidVariant d2(4, "G", vector<string> ({"C", ""}), true, false, 0);
+//	DiploidVariant d3(5, "C", vector<string> ({"T", ""}), true, false, 0); // this is false negative
+//	DiploidVariant d4(6, "C", vector<string> ({"G", ""}), true, false, 0);
+//	DiploidVariant d5(7, "G", vector<string> ({"A", ""}), true, false, 0);
+//	DiploidVariant d6(1, "T", vector<string> ({"A", "C"}), true, true, 1);
+//	DiploidVariant d7(3, "AG", vector<string> ({"A", ""}), true, false, 1);
+//	DiploidVariant d8(7, "G", vector<string> ({"GA", ""}), true, false, 1);
+//    complex_ref_match_num.push_back(0);
+//    complex_que_match_num.push_back(0);
+//    complex_match_records = new vector<string>*[1];
+//    complex_match_records[0] = new vector<string>;
+//	//vector<DiploidVariant> var_list = { d2,d3,d4,d5,d7,d8 };
+//	vector<DiploidVariant> var_list = { d1,d2,d3,d4,d5,d6,d7,d8 };
+//	cout << VariantMatchPathCreation(var_list, 0,0) << endl;
+	return 0;
+//int DiploidVCF::test() {
+//	genome_sequence = "AATATAT";
+//	DiploidVariant d1(0, vector<char>({ 'D', 'S' }), "AAT", vector<string>({ "A", "A" }), "1/2", false, false, 0);
+//	DiploidVariant d2(0, vector<char>({ 'D', 'S' }), "AAT", vector<string>({ "A", "" }), "0/1", true, false, 1);
+//	DiploidVariant d3(4, vector<char>({ 'D', 'S' }), "TAT", vector<string>({ "T", "" }), "0/1", true, false, 1);
+//	complex_ref_match_num.push_back(0);
+//	complex_que_match_num.push_back(0);
+//    complex_match_records = new vector<string>*[1];
+//    complex_match_records[0]= new vector<string>;
+//	//vector<DiploidVariant> var_list = { d2,d3,d4,d5,d7,d8 };
+//	vector<DiploidVariant> var_list = { d1,d2,d3 };
+//	cout << VariantMatchPathCreation(var_list, 0) << endl;
+//	return 0;
+// Sort variant_list in place using its element type's operator<.
+void DiploidVCF::SortVariantList(){
+    std::sort(std::begin(variant_list), std::end(variant_list));
+}
+// code reviewed by Chen on 4/4/2016
+//
+// Sort variant_list and split it into clusters stored in cluster_vars_map
+// (cluster index -> variants).
+//
+// A new cluster starts when the gap between the furthest end reached by the
+// current cluster (c_start) and the next variant (c_end) is at least 2 bases
+// and either
+//   - no insertion/deletion length has accumulated in the cluster, or
+//   - the gap is longer than 2 * max_change and is either longer than
+//     MAX_REPEAT_LEN or not reported as a tandem repeat by CheckTandemRepeat.
+//
+// variant_list is used as found: the code that once filled it here from
+// refpos_2_var / querypos_2_var has been removed, so it must already be
+// populated when this is called.
+//
+// Fix: for multi-alt variants the second alt was meant to widen snp_ins /
+// snp_del, but the inner block re-declared both names ("int snp_ins = max(snp_ins, ...)"),
+// reading the new, uninitialised variables and leaving the outer ones
+// untouched. The inner statements are now plain assignments.
+void DiploidVCF::ClusteringVariants() {
+	// in DiploidVariant, flag = 0 is reference, flag = 1 is query
+    dsptime();
+    sort(variant_list.begin(), variant_list.end());
+    dsptime();
+	int cluster_index = 0;
+	int ins_len[2] = { 0 };
+	int del_len[2] = { 0 };
+	int c_start = 0;
+	int c_end = 0;
+	for (size_t i = 0; i < variant_list.size(); i++) {
+		const auto & snp = variant_list[i];
+		// check if need to separator clusters
+		if (i > 0) {
+			c_end = snp.pos;
+			if (c_end - c_start >= 2) {
+                int separator_length = c_end - c_start;
+				string separator = genome_sequence.substr(c_start, separator_length);
+				// largest net length difference the two variant sets can create
+				int max_change = max(ins_len[0] + del_len[1], ins_len[1] + del_len[0]);
+				bool separate_cluster = false;
+				if(max_change == 0){
+                    separate_cluster = true;
+				}
+				else if (separator_length > 2 * max_change &&
+					(separator_length > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, max_change)))
+				{
+				    separate_cluster = true;
+				}
+				if(separate_cluster){
+                    cluster_index++;
+					ins_len[0] = 0;
+					del_len[0] = 0;
+					ins_len[1] = 0;
+					del_len[1] = 0;
+					c_start = 0; // re-assign c_start
+				}
+			}
+		}
+		c_start = max(c_start, snp.pos + (int)snp.ref.length() );
+		// assign snp to cluster
+		cluster_vars_map[cluster_index].push_back(snp);
+		const int ref_length = (int)snp.ref.length();
+		const int flag = snp.flag;
+        int snp_ins = max(0, (int)snp.alts[0].length() - ref_length);
+        int snp_del = max(0, ref_length - (int)snp.alts[0].length());
+        if(snp.multi_alts){
+            // take the larger change of the two alternate alleles
+            snp_ins = max(snp_ins, (int)snp.alts[1].length() - ref_length);
+            snp_del = max(snp_del, ref_length - (int)snp.alts[1].length());
+        }
+        ins_len[flag] += snp_ins;
+        del_len[flag] += snp_del;
+	}
+}
+// Sort ref_variant_list and que_variant_list, walk both in ascending
+// position order (the query variant is taken first on equal positions) and
+// assign every variant to a cluster in cluster_vars_map.
+//
+// A new cluster starts when the gap between the furthest end reached so far
+// (c_start) and the next variant's position (c_end) is at least 2 bases and
+// either no indel length has accumulated, or the gap is longer than
+// 2 * max_change and is longer than MAX_REPEAT_LEN or not reported as a
+// tandem repeat by CheckTandemRepeat.
+//
+// Accumulated lengths come from snp.mil / snp.mdl (presumably the variant's
+// maximum insertion / deletion length -- confirm against DiploidVariant).
+//
+// Change: removed a leftover debug statement that printed cluster_index to
+// stdout for the hard-coded position 142536905, and the unused ref_length.
+void DiploidVCF::LinearClusteringVariants() {
+	int cluster_index = 0;
+	int ins_len[2] = { 0 };
+	int del_len[2] = { 0 };
+	int c_start = 0;
+	int c_end = 0;
+    sort(ref_variant_list.begin(), ref_variant_list.end());
+    sort(que_variant_list.begin(), que_variant_list.end());
+    int ref_size = ref_variant_list.size();
+    int que_size = que_variant_list.size();
+    int ref_index = 0;
+    int que_index = 0;
+    bool not_first = false;
+    DiploidVariant snp;
+    while (ref_index < ref_size || que_index < que_size) {
+		// pick the next variant in position order
+		bool take_que = true;
+		if(ref_index < ref_size && que_index < que_size){
+            if(ref_variant_list[ref_index].pos < que_variant_list[que_index].pos){
+                take_que = false;
+            }
+		}else if(ref_index < ref_size){
+            take_que = false;
+		}
+		if(take_que){
+            snp = que_variant_list[que_index];
+            que_index++;
+		}else{
+            snp = ref_variant_list[ref_index];
+            ref_index++;
+		}
+		// check if need to separator clusters
+		if (not_first) {
+			c_end = snp.pos;
+			if (c_end - c_start >= 2) {
+                int separator_length = c_end - c_start;
+				string separator = genome_sequence.substr(c_start, separator_length);
+				int max_change = max(ins_len[0] + del_len[1], ins_len[1] + del_len[0]);
+				bool separate_cluster = false;
+				if(max_change == 0){
+                    separate_cluster = true;
+				}
+				else if (separator_length > 2 * max_change &&
+					(separator_length > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, max_change)))
+				{
+				    separate_cluster = true;
+				}
+				if(separate_cluster){
+                    cluster_index++;
+					ins_len[0] = 0;
+					del_len[0] = 0;
+					ins_len[1] = 0;
+					del_len[1] = 0;
+					c_start = 0; // re-assign c_start
+				}
+			}
+		}
+		c_start = max(c_start, snp.pos + (int)snp.ref.length() );
+		// assign snp to cluster
+		cluster_vars_map[cluster_index].push_back(snp);
+		not_first = true;
+		int flag = snp.flag;
+        ins_len[flag] += snp.mil;
+        del_len[flag] += snp.mdl;
+	}
+}
+// Mirror image of the forward linear clustering: sort both variant lists,
+// walk them from the back (the variant with the larger end coordinate first,
+// the query variant on ties) and assign every variant to a cluster in
+// cluster_vars_map.
+//
+// c_end tracks the smallest start position seen in the current cluster and
+// c_start the end coordinate of the variant being examined; the stretch
+// between them is the separator tested for a cluster break.
+void DiploidVCF::ReverseLinearClusteringVariants() {
+    const int kUnset = std::numeric_limits<int>::max();
+    int cluster_index = 0;
+    int ins_len[2] = { 0 };
+    int del_len[2] = { 0 };
+    int c_start = kUnset;
+    int c_end = kUnset;
+    sort(ref_variant_list.begin(), ref_variant_list.end());
+    sort(que_variant_list.begin(), que_variant_list.end());
+    int ref_index = (int)ref_variant_list.size() - 1;
+    int que_index = (int)que_variant_list.size() - 1;
+    bool not_first = false;
+    DiploidVariant snp;
+    while (ref_index >= 0 || que_index >= 0) {
+        // decide which list supplies the next variant
+        bool use_ref;
+        if (ref_index >= 0 && que_index >= 0) {
+            const auto & r = ref_variant_list[ref_index];
+            const auto & q = que_variant_list[que_index];
+            use_ref = r.pos + r.ref.size() > q.pos + q.ref.size();
+        } else {
+            use_ref = (ref_index >= 0);
+        }
+        if (use_ref) {
+            snp = ref_variant_list[ref_index];
+            --ref_index;
+        } else {
+            snp = que_variant_list[que_index];
+            --que_index;
+        }
+        // from the second variant on, test whether a new cluster starts here
+        if (not_first) {
+            c_start = snp.pos + snp.ref.size();
+            const int separator_length = c_end - c_start;
+            if (separator_length >= 2) {
+                string separator = genome_sequence.substr(c_start, separator_length);
+                const int max_change = max(ins_len[0] + del_len[1], ins_len[1] + del_len[0]);
+                const bool separate_cluster =
+                    max_change == 0 ||
+                    (separator_length > 2 * max_change &&
+                     (separator_length > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, max_change)));
+                if (separate_cluster) {
+                    ++cluster_index;
+                    ins_len[0] = ins_len[1] = 0;
+                    del_len[0] = del_len[1] = 0;
+                    c_end = kUnset; // restart the boundary for the new cluster
+                }
+            }
+        }
+        c_end = min(c_end, snp.pos);
+        // assign snp to cluster
+        cluster_vars_map[cluster_index].push_back(snp);
+        not_first = true;
+        const int flag = snp.flag;
+        ins_len[flag] += snp.mil;
+        del_len[flag] += snp.mdl;
+    }
+}
+// Repeatedly split clusters of variants at their largest internal reference gap.
+//
+// snp_clusters is modified in place: each pass pops every cluster from the front and pushes
+// back either the unchanged cluster or its left/right halves. A cluster is split when the
+// gap sequence is more than twice the smaller of the two halves' combined indel lengths and
+// is either longer than MAX_REPEAT_LEN or not reported as a tandem repeat by
+// CheckTandemRepeat. Passes continue until one completes without any split.
+//
+// NOTE(review): the gap scan assumes the variants inside each cluster are ordered by
+// position -- confirm against the caller.
+void DiploidVCF::DivisiveHierarchicalClustering(list<vector<DiploidVariant>> & snp_clusters){
+    // I use list of vectors instead of vector of vectors, to take advantage of member func of list
+    if(snp_clusters.empty()) return;
+    // potential_list runs parallel to snp_clusters: true while a cluster may still be split.
+    // at the very beginning, all clusters can be separated
+    // all newly separated sub-clusters can be separated
+    // if one cluster marked not separated, then it can never be separated
+    list<bool> potential_list(snp_clusters.size(), true);
+    // total number of variants over ALL clusters; splitting must never change it
+    int previous_variant_num = 0;
+    for(auto it = snp_clusters.begin(); it != snp_clusters.end(); ++it){
+        previous_variant_num += (int)it->size();
+    }
+    bool flag = true; // true while the last pass produced at least one split
+    while(flag){
+        flag = false;
+        int list_size = snp_clusters.size();
+        for(int i = 0; i < list_size; i++){
+            auto front_cluster = snp_clusters.front();
+            auto front_potential = potential_list.front(); // record if this can be separated
+            snp_clusters.pop_front();
+            potential_list.pop_front();
+            if(front_cluster.size() == 1){
+                // a single variant cannot be split any further
+                snp_clusters.push_back(front_cluster);
+                potential_list.push_back(false);
+                continue;
+            }
+            if(! front_potential){
+                snp_clusters.push_back(front_cluster);
+                potential_list.push_back(front_potential);
+                continue;
+            }
+            int max_start = -1;
+            int max_end = -1;
+            int max_length = -1;
+            // start tracks the furthest reference end covered by the variants seen so far
+            int start = front_cluster[0].pos + (int)front_cluster[0].ref.length();
+            // find the largest gap, see if we can separate from that gap
+            for(size_t k = 0; k < front_cluster.size(); k++){
+                const DiploidVariant & snp = front_cluster[k];
+                int snp_pos = snp.pos;
+                if(max_length < snp_pos - start){
+                    max_length = snp_pos - start;
+                    max_start = start;
+                    max_end = snp_pos;
+                }
+                // advance past this variant so the next gap is measured from the right place
+                start = max(start, snp_pos + (int)snp.ref.length());
+            }
+            if(max_length <= 0){
+                // no positive gap inside the cluster
+                snp_clusters.push_back(front_cluster);
+                potential_list.push_back(false);
+                continue;
+            }
+            // accumulated insertion/deletion lengths on each side of the gap, indexed by snp.flag
+            int left_ins[2] = {0};
+            int left_del[2] = {0};
+            int right_ins[2] = {0};
+            int right_del[2] = {0};
+            vector<DiploidVariant> left_snp_list;
+            vector<DiploidVariant> right_snp_list;
+            string separator = genome_sequence.substr(max_start, max_end-max_start);
+            for(size_t k = 0; k < front_cluster.size(); k++){
+                DiploidVariant snp = front_cluster[k];
+                int rq = snp.flag;
+                int snp_ins = max(0, (int)snp.alts[0].length() - (int)snp.ref.length());
+                int snp_del = max(0, (int)snp.ref.length() - (int)snp.alts[0].length());
+                if(snp.multi_alts){
+                    // take the second allele into account; assign to the outer variables
+                    // (re-declaring them here would shadow them and discard the result)
+                    snp_ins = max(snp_ins, (int)snp.alts[1].length() - (int)snp.ref.length());
+                    snp_del = max(snp_del, (int)snp.ref.length() - (int)snp.alts[1].length());
+                }
+                if(snp.pos <= max_start){
+                    left_ins[rq] += snp_ins;
+                    left_del[rq] += snp_del;
+                    left_snp_list.push_back(snp);
+                }else{
+                    right_ins[rq] += snp_ins;
+                    right_del[rq] += snp_del;
+                    right_snp_list.push_back(snp);
+                }
+            }
+            //check
+            if(left_snp_list.size() == 0 || right_snp_list.size() == 0){
+                snp_clusters.push_back(front_cluster);
+                potential_list.push_back(false);
+                continue;
+            }
+            // largest combined change length on each side, then the smaller of the two sides
+            int mcll = max(left_del[0]+left_ins[1], left_del[1]+left_ins[0]);
+            int mclr = max(right_del[0]+right_ins[1], right_del[1]+right_ins[0]);
+            int min_mcl = min(mcll, mclr);
+            if ((int)separator.length() > 2 * min_mcl &&
+                    ((int)separator.length() > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, min_mcl)))
+            {
+                flag = true;
+                snp_clusters.push_back(left_snp_list);
+                potential_list.push_back(true);
+                snp_clusters.push_back(right_snp_list);
+                potential_list.push_back(true);
+            }else{
+                snp_clusters.push_back(front_cluster);
+                potential_list.push_back(false);
+                continue;
+            }
+        }
+        // sanity check: the total number of variants must be preserved by every pass
+        int current_variant_num = 0;
+        for(auto it = snp_clusters.begin(); it != snp_clusters.end(); ++it){
+            current_variant_num += (int)it->size();
+        }
+        if(current_variant_num != previous_variant_num){
+            dout << "[VarMatch] Error during clustering" << endl;
+        }
+    }
+    return;
+// private
+// code reviewed
+// Run AcceleratedVariantMatchPathCreation on every cluster whose id lies in [start, end)
+// and is present in cluster_vars_map. thread_index and the cluster id are forwarded to it.
+// Clusters with more than 100 variants are reported on stdout as "<cluster_id>:<size>".
+// Always returns true.
+bool DiploidVCF::ClusteringMatchInThread(int start, int end, int thread_index) {
+    // end exclusive
+	for (int cluster_id = start; cluster_id < end; cluster_id++) {
+		// single lookup; avoids calling the non-const operator[] on a map shared between threads
+		auto cluster_it = cluster_vars_map.find(cluster_id);
+		if (cluster_it == cluster_vars_map.end()) continue;
+		auto & var_list = cluster_it->second;
+		if (var_list.size() > 100){
+            cout << cluster_id << ":" ;
+            cout << var_list.size() << endl;
+        }
+		// VariantMatchPathCreationByDonor(var_list, thread_index, cluster_id) takes the same
+		// arguments and was previously used here to cross-check the result
+		AcceleratedVariantMatchPathCreation(var_list, thread_index, cluster_id);
+	}
+	return true;
+// private
+// Distribute the clusters in cluster_vars_map over thread_num workers and write the matches.
+//
+// Cluster ids are cut into consecutive ranges of cluster_step ids; thread_num - 1 ranges run
+// in spawned threads and the last one runs in the calling thread. After all threads are
+// joined, the per-thread records are written to output_complex_filename (one header block,
+// then each non-blank record prefixed with chromosome_name), the per-thread buffers are
+// freed, and the per-thread match counters are summed into total_ref_complex and
+// total_que_complex.
+void DiploidVCF::ClusteringMatchMultiThread() {
+	clustering_search = true;
+	// begin() must not be dereferenced on an empty map; fall back to id 0 in that case
+	int start = cluster_vars_map.empty() ? 0 : cluster_vars_map.begin()->first; // start cluster id
+	int cluster_number = cluster_vars_map.size(); // cluster number
+	int cluster_end_boundary = start + cluster_number; // end cluster id, exclusive
+	int cluster_step = cluster_number / thread_num; // assign clusters to threads
+	if (cluster_step * thread_num < cluster_number) cluster_step++; // round up
+	int end = start + cluster_step;
+	//initialize vector size
+	complex_match_records = new vector<string>*[thread_num];
+	for (int j = 0; j < thread_num; j++) {
+		complex_match_records[j] = new vector<string>;
+		complex_ref_match_num.push_back(0);
+		complex_que_match_num.push_back(0);
+	}
+	vector<thread> threads;
+	//spawn threads
+	unsigned i = 0;
+	for (; i < thread_num - 1; i++) {
+		threads.push_back(thread(&DiploidVCF::ClusteringMatchInThread, this, start, end, i));
+		start = end;
+		end = start + cluster_step;
+	}
+	// also you need to do a job in main thread
+	// i equals to (thread_num - 1)
+	if (i != thread_num - 1) {
+		dout << "[Error] thread number not match" << endl;
+	}
+	// compare against the exclusive end id, not the cluster count: the two only coincide
+	// when the first cluster id is 0
+	if (start >= cluster_end_boundary) {
+		dout << "[Error] index out of map range" << endl;
+	}
+	else {
+		ClusteringMatchInThread(start, end, i);
+	}
+	// wait for every worker before reading their results
+	std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
+	ofstream output_complex_file;
+	output_complex_file.open(output_complex_filename);
+	output_complex_file << "##VCF1:" << ref_vcf_filename << endl;
+	output_complex_file << "##VCF2:" << que_vcf_filename << endl;
+	output_complex_file << "#CHROM\tPOS\tREF\tALT\tVCF1\tVCF2\tPHASE1\tPHASE2\tSCORE" << endl;
+	for (int t = 0; t < thread_num; t++) {
+		const vector<string> & records = *complex_match_records[t];
+		for (size_t j = 0; j < records.size(); j++) {
+			// skip records that are empty or consist only of spaces
+			if (records[j].find_first_not_of(' ') != std::string::npos) {
+				output_complex_file << chromosome_name << "\t" << records[j];
+			}
+		}
+	}
+	output_complex_file.close();
+	for (int j = 0; j < thread_num; j++) {
+		delete complex_match_records[j];
+	}
+	delete[] complex_match_records;
+	total_ref_complex = 0;
+	total_que_complex = 0;
+	for (size_t k = 0; k < complex_ref_match_num.size(); k++)
+		total_ref_complex += complex_ref_match_num[k];
+	for (size_t k = 0; k < complex_que_match_num.size(); k++)
+		total_que_complex += complex_que_match_num[k];
+	cout << "complex match: " << total_ref_complex << "," << total_que_complex << endl;
+// for public access
+// Full comparison pipeline: read the genome and both VCF files, cluster the variants,
+// run the multi-threaded clustering match, then report totals on dout and write six
+// numbers (ref total, query total, ref matched, query matched, ref mismatched, query
+// mismatched), one per line, to "<output_prefix>.stat". Matches go to "<output_prefix>.match".
+// direct_search is accepted but not read in this function.
+void DiploidVCF::Compare(string ref_vcf,
+	string query_vcf,
+	string genome_seq,
+	bool direct_search,
+	string output_prefix,
+	bool match_genotype,
+	bool normalization,
+	bool score_basepair,
+	bool overlap_match,
+	bool variant_check) {
+	// remember file names and options
+	ref_vcf_filename = ref_vcf;
+	que_vcf_filename = query_vcf;
+	output_stat_filename = output_prefix + ".stat";
+	output_complex_filename = output_prefix + ".match";
+	this->match_genotype = match_genotype;
+	this->normalization = normalization;
+	this->scoring_basepair = score_basepair;
+	this->overlap_match = overlap_match;
+	this->variant_check = variant_check;
+
+	// ---- genome sequence
+	dsptime();
+	dout << " Read genome sequence file... " << endl;
+	ReadGenome(genome_seq);
+	dsptime();
+	dout << " Finish reading genome sequence file." << endl;
+
+	// ---- reference and query variants
+	dsptime();
+	dout << " Read reference vcf file... " << endl;
+	const int ref_total_num = ReadRefVCF(ref_vcf);
+	dsptime();
+	dout << " Read query vcf file... " << endl;
+	const int que_total_num = ReadQueryVCF(query_vcf);
+	dsptime();
+	dout << " Finish reading all vcf file." << endl;
+	dout << " total variants: " << ref_total_num << "," << que_total_num << endl;
+
+	// ---- clustering
+	dsptime();
+	dout << " Clustering snps ... " << endl;
+	LinearClusteringVariants();
+	dsptime();
+	dout << " Finish clustering." << endl;
+
+	// ---- matching inside clusters
+	dsptime();
+	dout << " Clustering search ... " << endl;
+	ClusteringMatchMultiThread();
+	dsptime();
+	dout << " Finish clustering search." << endl;
+	dout << " total match: " << total_ref_complex << "," << total_que_complex << endl;
+
+	// ---- summary
+	const int ref_mismatch_num = ref_total_num - total_ref_complex;
+	const int que_mismatch_num = que_total_num - total_que_complex;
+	dout << " mismatch: " << ref_mismatch_num << "," << que_mismatch_num << endl;
+
+	ofstream output_stat_file(output_stat_filename);
+	output_stat_file << ref_total_num << endl
+	                 << que_total_num << endl
+	                 << total_ref_complex << endl
+	                 << total_que_complex << endl
+	                 << ref_mismatch_num << endl
+	                 << que_mismatch_num << endl;
+	output_stat_file.close();
+	return;
diff --git a/src/diploid.h b/src/diploid.h
new file mode 100644
index 0000000..2010082
--- /dev/null
+++ b/src/diploid.h
@@ -0,0 +1,342 @@
+#pragma once
+#include "vcf.h"
+// data structure for direct search
+// One variant record: position, reference allele, up to two alternate alleles and
+// genotype-related flags. All comparison members are const so they can be used on
+// const objects and references.
+class DiploidVariant {
+    DiploidVariant(int pos_ = -1,
+        string ref_ = "",
+        vector<string> alts_ = {"",""},
+        bool heterozygous_ = false,
+        bool multi_alts_ = false,
+        int mdl_ = 0,
+        int mil_ = 0,
+        int flag_ = 0) :
+        pos(pos_),
+        ref(ref_),
+        alts(alts_),
+        heterozygous(heterozygous_),
+        multi_alts(multi_alts_),
+        mdl(mdl_),
+        mil(mil_),
+        flag(flag_){}
+    int pos;
+    string ref;
+    vector<string> alts; // the comparisons below read alts[0] and alts[1]
+    bool heterozygous;
+    bool multi_alts;
+    int mdl; // NOTE(review): presumably max deletion length -- confirm where it is set
+    int mil; // NOTE(review): presumably max insertion length -- confirm where it is set
+    int flag; //in DiploidVariant, flag = 0 is reference, flag = 1 is query
+    // order by position only
+    bool operator <(const DiploidVariant& y) const {
+        return pos < y.pos;
+    }
+    // Genotype-aware equality: same pos and ref, same heterozygous/multi_alts flags, and
+    // either at least two equal (alts[i], y.alts[j]) pairs when both alleles are alternate
+    // and heterozygous, or equal first alternate alleles otherwise.
+    // this is based on the assumption that all sequence are in upper case
+    bool operator ==(const DiploidVariant& y) const {
+        if (pos == y.pos && ref == y.ref) {
+            if(heterozygous == y.heterozygous && multi_alts == y.multi_alts){
+                if (multi_alts && heterozygous) {
+                    int match_times = 0;
+                    for (int i = 0; i < 2; i++) {
+                        for (int j = 0; j < 2; j++) {
+                            if (alts[i] == y.alts[j])
+                                match_times++;
+                        }
+                    }
+                    if (match_times >= 2)
+                        return true;
+                }
+                else if(alts[0] == y.alts[0]){
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+    // Looser comparison: same pos and ref, and either any shared alternate allele when both
+    // variants are multi-allelic heterozygous, or equal first alternate alleles otherwise.
+    bool DirectCompare(const DiploidVariant& y) const {
+        if (pos == y.pos && ref == y.ref) {
+            if (multi_alts && heterozygous && y.multi_alts && y.heterozygous) {
+                int match_times = 0;
+                for (int i = 0; i < 2; i++) {
+                    for (int j = 0; j < 2; j++) {
+                        if (alts[i] == y.alts[j])
+                            match_times++;
+                    }
+                }
+                if (match_times > 0)
+                    return true;
+            }
+            else if(alts[0] == y.alts[0]){
+                return true;
+            }
+        }
+        return false;
+    }
+    // Genotype-free comparison: same pos and ref, and any alternate allele in common.
+    // alts[1] of either side is only consulted when that side has multi_alts set.
+    bool CompareNoGenotype(const DiploidVariant & y) const {
+        if(pos == y.pos && ref == y.ref){
+            if(alts[0] == y.alts[0]) return true;
+            if(multi_alts){
+                if(alts[1] == y.alts[0]) return true;
+                if(y.multi_alts && alts[1] == y.alts[1]){
+                    return true;
+                }
+            }
+            if(y.multi_alts && alts[0] == y.alts[1]){
+                return true;
+            }
+        }
+        return false;
+    }
+// define outside of struct, idiomatic solution for lexicographical compare for structures
+//bool operator <(const DiploidVariant& x, const DiploidVariant& y);
+//bool operator ==(const DiploidVariant& x, const DiploidVariant& y);
+// State of one candidate selection of variants; instances are ordered by min_genome_pos.
+class VariantSelection{
+    int score;
+    int separate_score[2];
+    int min_genome_pos; // min(donor_sequence[0], donor_sequence[2])
+    bool haplotypes_consistent;
+    int genome_position[2]; // genome position that has been considered, exclusive
+    int donor_length[2];
+    string donor_sequences[4];
+    vector<int> pos_vectors[2]; // selected variants, not necessary now
+    vector<int> phasing_vectors[2]; // phasing vector for corresponding variant, for D_0
+    int cur_var;
+    bool overlap_detected;
+    // Start from an empty selection: zero scores, lengths and positions, no current
+    // variant (-1), no minimum genome position (-1), both boolean flags false.
+    VariantSelection() :
+        score(0),
+        min_genome_pos(-1),
+        haplotypes_consistent(false),
+        cur_var(-1),
+        overlap_detected(false)
+    {
+        // the string and vector arrays are default-constructed empty;
+        // only the plain int arrays need explicit zeroing
+        for(int side = 0; side < 2; ++side){
+            separate_score[side] = 0;
+            genome_position[side] = 0;
+            donor_length[side] = 0;
+        }
+    }
+    bool operator< (const VariantSelection& rhs) const // sort by min_genome_position
+    {
+        return rhs.min_genome_pos > min_genome_pos;
+    }
+// VCF comparison for diploid variants, derived from VCF.
+// Compare() is the entry point: it reads the genome and both VCF files, clusters the
+// variants and runs the multi-threaded clustering match.
+class DiploidVCF : public VCF
+    // one int-keyed table of variants per vector element (unordered / ordered flavour)
+    typedef vector<unordered_map<int, DiploidVariant > > VariantHash;
+    typedef vector<map<int, DiploidVariant > > VariantMap;
+	VariantHash refpos_2_var;
+	VariantHash querypos_2_var;
+	vector<DiploidVariant> variant_list;
+	// variant lists that the linear clustering sorts and merges
+	vector<DiploidVariant> ref_variant_list;
+	vector<DiploidVariant> que_variant_list;
+    // return values are reported by Compare() as the total variant counts
+    int ReadRefVCF(string filename);
+    int ReadQueryVCF(string filename);
+	void DirectSearchInThread(unordered_map<int, DiploidVariant> & ref_snps,
+                           unordered_map<int, DiploidVariant> & query_snps,
+                           int thread_index);
+	void DirectSearchMultiThread();
+    void ClusteringVariants();
+    // arguments: first cluster id, end cluster id (exclusive), thread index
+    bool ClusteringMatchInThread(int, int, int);
+	void ClusteringMatchMultiThread();
+	ofstream offf;
+	const time_t ctt = time(0);
+    // one counter per worker thread; summed into the totals below after matching
+    vector<int> complex_ref_match_num;
+	vector<int> complex_que_match_num;
+    int total_ref_complex;
+	int total_que_complex;
+	// options copied from the arguments of Compare()
+	bool scoring_basepair;
+	bool overlap_match;
+	bool variant_check;
+	// cluster id -> variants assigned to that cluster
+	map<int, vector<DiploidVariant> > cluster_vars_map;
+	void DecideBoundaries();
+	int ReadDiploidVCF(string filename, vector<DiploidVariant> & x_variant_list, int flag);
+	bool NormalizeDiploidVariant(DiploidVariant & var);
+	bool VariantMatch(vector<DiploidVariant> & variant_list, int thread_index);
+    bool FindBestMatch(vector<DiploidVariant> & variant_list,
+		const string subsequence,
+		const int offset,
+		int index,
+		map<int, DiploidVariant> separate_pos_var[],
+		vector<vector<int>> max_choices[],  // 4 vectors
+		int & max_score,
+		bool & max_heterozygosity,
+		string max_paths[]); //only two
+    bool FindBestDiploidMatch(vector<DiploidVariant> & variant_list,
+		const string subsequence,
+		const int offset,
+		int index,
+		map<int, DiploidVariant> separate_pos_var[],
+		vector<vector<int>> max_choices[],  // 4 vectors
+		int & max_score,
+		bool & max_heterozygosity,
+		string max_paths[]); //only two
+    int CheckPrefix(const string subsequence,
+		const int offset,
+		map<int, DiploidVariant> separate_pos_var[],
+		map<int, int> choices[],
+		string cur_paths[]);
+	bool RecurrentVariantMatch(vector<DiploidVariant> & variant_list, int thread_index);
+	void RecurrentMatchWithIndel(vector<DiploidVariant> & variant_list,
+		const string subsequence,
+		const int offset,
+		int index,
+		map<int, DiploidVariant> separate_pos_var[],
+		map<int, int> choices[], // 4 vectors
+		map<int, int> max_matches[],  // 4 vectors
+		int & max_score,
+		string max_paths[]);
+	vector<vector<vector<int>>> Combine(vector<int> & positions,
+                                     vector<bool> & multi_indicators,
+                                     int k);
+	void FindComb(vector<int> & positions,
+		vector<bool> & multi_indicators,
+		int start,
+		int k,
+		vector<vector<int> > & sol,
+		vector<vector<vector<int>>> & all_sol);
+    vector<vector<vector<int>>> DiploidCombine(vector<int> & positions,
+                                               vector<bool> & heter_indicators,
+                                               vector<bool> & multi_indicators,
+                                               int k);
+    void FindDiploidComb(vector<int> & positions,
+        vector<bool> & heter_indicators,
+        vector<bool> & multi_indicators,
+        int start,
+        int k,
+        vector<vector<int> > & sol,
+        vector<vector<vector<int>>> & all_sol);
+    void ModifyRefMultiVar(const string & ref,
+                           int offset,
+                           map<int, DiploidVariant> & pos_var,
+                           vector<vector<int>> pos_choice,
+                           string & donor,
+                           int & score);
+    // splits the clusters of the given list in place
+    void DivisiveHierarchicalClustering(list<vector<DiploidVariant> > & snp_clusters);
+    bool VariantMatchWithOverlap(vector<DiploidVariant> & variant_list, int thread_index);
+    bool FindBestMatchWithOverlap(vector<DiploidVariant> & variant_list,
+                                const string subsequence,
+                                const int offset,
+                                int index,
+                                map<int, DiploidVariant> separate_pos_var[],
+                                map<int, int> selected_positions[]);
+    bool VariantMatchPathCreation(vector<DiploidVariant> & variant_list, int thread_index, int cluster_id);
+    bool CollapseSelections(VariantSelection selection,
+                            list<VariantSelection> & variant_selections);
+    int CheckDonorSequences(vector<DiploidVariant> separate_var_list[],
+                                      VariantSelection & selection,
+                                      const string & subsequence,
+                                      int offset,
+                                      string donor_sequences[]);
+      bool AddVariantToSelection(list<VariantSelection> & variant_selections,
+       VariantSelection selection,
+       DiploidVariant variant,
+       int haplotype,
+       vector<DiploidVariant> separate_var_list[],
+       const string & subsequence,
+       int offset,
+       VariantSelection & best_selection);
+    void SortVariantList();
+    void ReadGenome(string filename);
+    // clustering step called by Compare() before the clustering match
+    void LinearClusteringVariants();
+    int NormalizeVariantSequence(int pos,
+                                 string & parsimonious_ref,
+                                 string & parsimonious_alt0,
+                                 string & parsimonious_alt1);
+    int ExtendingDonorSequences(vector<DiploidVariant> separate_var_list[],
+                                      VariantSelection & selection,
+                                      const string & subsequence,
+                                      int offset,
+                                      int flag);
+    bool CollapsePrefixMatchSelection(VariantSelection selection,
+                                    list<VariantSelection> & variant_selections);
+    // same clustering idea as LinearClusteringVariants, walking the variants from the back
+    void ReverseLinearClusteringVariants();
+    // matching routine invoked per cluster by ClusteringMatchInThread
+    bool AcceleratedVariantMatchPathCreation(vector<DiploidVariant> & variant_list, int thread_index, int cluster_id);
+    bool VariantMatchPathCreationByDonor(vector<DiploidVariant> & variant_list, int thread_index, int cluster_id);
+    int CheckDonorSequencesWithOverlap(vector<DiploidVariant> separate_var_list[],
+                                      VariantSelection & selection,
+                                      const string & subsequence,
+                                      int offset,
+                                      string donor_sequences[]);
+    void PrintVariant(DiploidVariant var);
+	DiploidVCF(int thread_num_);
+	~DiploidVCF();
+	const static int VAR_LEN = 100;
+    int test();
+    // for public access
+    // NOTE(review): the definition in diploid.cpp names the eighth parameter score_basepair
+    // and does not read direct_search
+	void Compare(string ref_vcf,
+		string query_vcf,
+		string genome_seq,
+		bool direct_search,
+		string output_prefix,
+		bool match_genotype,
+		bool normalization,
+		bool scoring_basepair,
+		bool overlap_match,
+		bool variant_check);
diff --git a/src/diploidvariant.h b/src/diploidvariant.h
new file mode 100644
index 0000000..cd3acdd
--- /dev/null
+++ b/src/diploidvariant.h
@@ -0,0 +1,117 @@
+// data structure for direct search
+// One variant record: position, reference allele, up to two alternate alleles,
+// genotype-related flags and a quality value. All comparison members are const so they
+// can be used on const objects and references.
+class DiploidVariant {
+    // The member initializers are listed in declaration order, which is the order in
+    // which the members are actually initialized.
+    DiploidVariant(int pos_ = -1,
+        string ref_ = "",
+        vector<string> alts_ = {"",""},
+        bool heterozygous_ = false,
+        bool multi_alts_ = false,
+        int mdl_ = 0,
+        int mil_ = 0,
+        bool flag_ = false,
+        double qual_ = 0.0,
+        bool zero_one_var_ = false) :
+        pos(pos_),
+        ref(ref_),
+        alts(alts_),
+        heterozygous(heterozygous_),
+        multi_alts(multi_alts_),
+        zero_one_var(zero_one_var_),
+        mdl(mdl_),
+        mil(mil_),
+        flag(flag_),
+        qual(qual_){}
+    int pos;
+    string ref;
+    vector<string> alts; // the comparisons below read alts[0] and alts[1]
+    bool heterozygous;
+    bool multi_alts;
+    bool zero_one_var; // which means the phasing should be 0/1 or 1/0, no matter if it contains multi_alts
+    // i.e. multi_alts does not mean that it is 1/2 or 2/1
+    int mdl; // NOTE(review): presumably max deletion length -- confirm where it is set
+    int mil; // NOTE(review): presumably max insertion length -- confirm where it is set
+    bool flag; //in DiploidVariant, flag = false is reference, flag = true is query
+    // keep flag as int? not necessary
+    double qual;
+    // order by position only
+    bool operator <(const DiploidVariant& y) const {
+        return pos < y.pos;
+    }
+    // Genotype-aware equality: same pos and ref, and one of
+    //  - same heterozygous/multi_alts flags with either at least two equal
+    //    (alts[i], y.alts[j]) pairs (multi-allelic heterozygous) or equal first alternates;
+    //  - both sides multi_alts and zero_one_var with more than one equal allele pair.
+    // this is based on the assumption that all sequence are in upper case
+    bool operator ==(const DiploidVariant& y) const {
+        if (pos == y.pos && ref == y.ref) {
+            if(heterozygous == y.heterozygous && multi_alts == y.multi_alts){
+                if (multi_alts && heterozygous) {
+                    int match_times = 0;
+                    for (int i = 0; i < 2; i++) {
+                        for (int j = 0; j < 2; j++) {
+                            if (alts[i] == y.alts[j])
+                                match_times++;
+                        }
+                    }
+                    if (match_times >= 2)
+                        return true;
+                }
+                else if(alts[0] == y.alts[0]){
+                    return true;
+                }
+            }
+            if(multi_alts && zero_one_var && y.multi_alts && y.zero_one_var){
+                int match_times = 0;
+                for (int i = 0; i < 2; i++) {
+                    for (int j = 0; j < 2; j++) {
+                        if (alts[i] == y.alts[j])
+                            match_times++;
+                    }
+                }
+                if(match_times > 1) return true;
+            }
+        }
+        return false;
+    }
+    // Looser comparison: same pos and ref, and either any shared alternate allele when both
+    // variants are multi-allelic heterozygous, or equal first alternate alleles otherwise.
+    bool DirectCompare(const DiploidVariant& y) const {
+        if (pos == y.pos && ref == y.ref) {
+            if (multi_alts && heterozygous && y.multi_alts && y.heterozygous) {
+                int match_times = 0;
+                for (int i = 0; i < 2; i++) {
+                    for (int j = 0; j < 2; j++) {
+                        if (alts[i] == y.alts[j])
+                            match_times++;
+                    }
+                }
+                if (match_times > 0)
+                    return true;
+            }
+            else if(alts[0] == y.alts[0]){
+                return true;
+            }
+        }
+        return false;
+    }
+    // Genotype-free comparison: same pos and ref, and any alternate allele in common.
+    // alts[1] of either side is only consulted when that side has multi_alts set.
+    bool CompareNoGenotype(const DiploidVariant & y) const {
+        if(pos == y.pos && ref == y.ref){
+            if(alts[0] == y.alts[0]) return true;
+            if(multi_alts){
+                if(alts[1] == y.alts[0]) return true;
+                if(y.multi_alts && alts[1] == y.alts[1]){
+                    return true;
+                }
+            }
+            if(y.multi_alts && alts[0] == y.alts[1]){
+                return true;
+            }
+        }
+        return false;
+    }
diff --git a/src/filter_cv.cpp b/src/filter_cv.cpp
new file mode 100644
index 0000000..87eb773
--- /dev/null
+++ b/src/filter_cv.cpp
@@ -0,0 +1,245 @@
+#include <map>
+#include <unordered_map>
+#include <vector>
+#include <iostream>
+#include <algorithm>
+#include <tclap/CmdLine.h>
+#include "util.h"
+using namespace std;
+// all intervals are 0 based coordinate
+// Integer interval described by its start and end coordinates (0-based).
+// Default-constructed intervals are [0, 0].
+typedef struct Interval {
+    int start;
+    int end;
+    Interval() : start(0), end(0) {}
+    Interval(int s, int e) : start(s), end(e) {}
+} Interval;
+// Comparator ordering intervals by ascending start coordinate.
+struct compInterval {
+    bool operator()(const Interval &a, const Interval &b) const {
+        return a.start<b.start;
+    }
+};
+// Parsed command-line arguments of filter_cv.
+typedef struct Args {
+    string baseline_filename;       // value of -b/--baseline
+    vector<string> vcf_filenames;   // values of -v/--vcf_files
+} Args;
+// Parse the command line into `args` with TCLAP.
+// Both -b/--baseline and -v/--vcf_files are declared as required.
+// On a TCLAP::ArgException the error is written to stderr and the process
+// is aborted; otherwise the function returns true.
+bool TclapParser(Args & args, int argc, char** argv){
+    const string version = "0.9";
+    try {
+        std::string desc = "Please cite our paper if you are using this program in your research. \n";
+        TCLAP::CmdLine cmd(desc, ' ', version);
+        TCLAP::ValueArg<std::string> arg_baseline_filename("b", "baseline", "VCF file", true, "", "file");
+        TCLAP::MultiArg<std::string> arg_vcf_filenames("v", "vcf_files", "VCF file list", true, "file list");
+        cmd.add(arg_vcf_filenames);
+        cmd.add(arg_baseline_filename);
+        cmd.parse(argc, argv);
+        args.baseline_filename = arg_baseline_filename.getValue();
+        args.vcf_filenames = arg_vcf_filenames.getValue();
+    }
+    catch (TCLAP::ArgException &e)
+    {
+        std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
+        abort();
+    }
+    return true;
+}
+// Merge overlapping or touching intervals.
+// `intervals` is sorted in place by start coordinate, then swept once: an
+// interval whose start is <= the end of the last merged interval is folded
+// into it, otherwise it opens a new merged interval.
+// Returns the merged intervals in ascending start order.
+vector<Interval> merge(vector<Interval> &intervals) {
+    sort(intervals.begin(), intervals.end(), compInterval());
+    vector<Interval> results;
+    for (const Interval &current : intervals) {
+        if (results.empty() || results.back().end < current.start) {
+            // no overlap with the last merged interval
+            results.push_back(current);
+        } else {
+            // overlap: extend the last merged interval if needed
+            results.back().end = max(results.back().end, current.end);
+        }
+    }
+    return results;
+}
+// Read all variant records of a VCF file, grouped by chromosome.
+//
+// For every non-header record:
+//   * the chromosome name (column 0) is mapped to an index in
+//     `chrname_index`; a new chromosome appends an empty entry to both
+//     `interval_list_list` and `variant_hash_list`;
+//   * the raw line is stored in variant_hash_list[chr] keyed by the 0-based
+//     position;
+//   * if the first or second ALT allele differs in length from REF (an
+//     indel), the window [pos - 10, pos + 10] is appended to
+//     interval_list_list[chr].
+// Records with fewer than five tab-separated columns are skipped.
+//
+// Returns the number of records stored, or -1 if the file cannot be opened.
+int ReadWholeGenomeVariant(string filename,
+    vector<vector<Interval>> & interval_list_list,
+    vector<multimap<int, string>> & variant_hash_list,
+    map<string, int> & chrname_index)
+{
+    ifstream vcf_file(filename.c_str());
+    if (!vcf_file.good()) {
+        cout << "[VarMatch] Error: can not open vcf file" << endl;
+        return -1;
+    }
+    int total_num = 0;
+    int chr_num = 0;
+    string line;
+    while (getline(vcf_file, line)) {
+        // skip blank lines and header/meta lines
+        if ((int)line.length() <= 1) continue;
+        if (line[0] == '#') continue;
+        auto columns = split(line, '\t');
+        // CHROM, POS, ID, REF and ALT are all needed below
+        if (columns.size() < 5) continue;
+        string chr_name = columns[0];
+        if (chrname_index.find(chr_name) == chrname_index.end()) {
+            chrname_index[chr_name] = chr_num;
+            chr_num++;
+            interval_list_list.push_back(vector<Interval>());
+            variant_hash_list.push_back(multimap<int, string>());
+        }
+        int chr_index = chrname_index[chr_name];
+        int pos = atoi(columns[1].c_str()) - 1; // 0-based coordinate
+        const string ref = columns[3];
+        const string alt_line = columns[4];
+        vector<string> alt_list;
+        if (alt_line.find(",") != std::string::npos) {
+            alt_list = split(alt_line, ',');
+        }
+        else {
+            alt_list.push_back(alt_line);
+        }
+        // Only the first two ALT alleles are inspected, as before.
+        bool is_indel = false;
+        for (size_t a = 0; a < alt_list.size() && a < 2; a++) {
+            if (alt_list[a].length() != ref.length()) is_indel = true;
+        }
+        if (is_indel) {
+            int interval_start = pos - 10;
+            int interval_end = pos + 10;
+            interval_list_list[chr_index].push_back(Interval(interval_start, interval_end));
+        }
+        variant_hash_list[chr_index].insert(make_pair(pos, line));
+        total_num++;
+    }
+    vcf_file.close();
+    return total_num;
+}
+// Extract "complex" variants from the VCF named by argv[1] into
+// "<input>.cv.vcf".
+//
+// Steps:
+//   1. read all records and the +/-10 windows around indels;
+//   2. merge the windows per chromosome and keep the variants whose position
+//      falls inside a merged window (candidates);
+//   3. write every candidate that lies within 10 positions of the previous
+//      or the next candidate on the same chromosome.
+// Candidates are kept as (position, line) pairs so that several records at
+// the same position are each written with their own line.
+//
+// Returns 0 on success, -1 when the argument is missing or a file cannot
+// be opened.
+int FilterComplexVariant(int argc, char* argv[]){
+    if (argc < 2) {
+        cout << "[VarMatch] Error: missing input vcf file argument" << endl;
+        return -1;
+    }
+    string input_filename = string(argv[1]);
+    string output_filename = input_filename + ".cv.vcf";
+    vector<vector<Interval>> interval_list_list;
+    vector<multimap<int, string>> variant_hash_list;
+    map<string, int> chrname_index;
+    if (ReadWholeGenomeVariant(input_filename,
+            interval_list_list,
+            variant_hash_list,
+            chrname_index) < 0) {
+        return -1;
+    }
+    // Per chromosome: merged window end -> window start.
+    vector<map<int, int>> end_start_list;
+    for (auto & interval_list : interval_list_list) {
+        vector<Interval> merged_intervals = merge(interval_list);
+        map<int, int> end_start;
+        for (const auto & window : merged_intervals) {
+            end_start[window.end] = window.start;
+        }
+        end_start_list.push_back(end_start);
+    }
+    vector<vector<pair<int, string>>> candidate_list;
+    cout << "filtering candidate variants..." << endl;
+    for (size_t k = 0; k < variant_hash_list.size(); k++) {
+        cout << "filtering candidate variants on chromosome " << k << endl;
+        const auto & variant_hash = variant_hash_list[k];
+        const auto & end_start = end_start_list[k];
+        vector<pair<int, string>> candidates;
+        for (auto it = variant_hash.begin(); it != variant_hash.end(); ++it) {
+            int varp = it->first;
+            // first merged window whose end is >= varp
+            auto lowit = end_start.lower_bound(varp);
+            if (lowit == end_start.end()) continue;
+            int interval_start = lowit->second;
+            int interval_end = lowit->first;
+            if (varp >= interval_start && varp < interval_end) {
+                candidates.push_back(make_pair(varp, it->second));
+            }
+        }
+        candidate_list.push_back(candidates);
+    }
+    cout << "filtered all candidate variants." << endl;
+    ofstream cv_file(output_filename);
+    if (!cv_file.good()) {
+        cout << "[VarMatch] Error: can not open output file " << output_filename << endl;
+        return -1;
+    }
+    cout << "filtering complex variants..." << endl;
+    for (size_t k = 0; k < candidate_list.size(); k++) {
+        const auto & candidates = candidate_list[k];
+        cout << "filtering complex variants on chromosome " << k << endl;
+        for (size_t i = 0; i < candidates.size(); i++) {
+            int cur_pos = candidates[i].first;
+            bool near_previous = i > 0 && cur_pos - candidates[i - 1].first <= 10;
+            bool near_next = i + 1 < candidates.size() && candidates[i + 1].first - cur_pos <= 10;
+            if (near_previous || near_next) {
+                cv_file << candidates[i].second << endl;
+            }
+        }
+    }
+    cout << "finished" << endl;
+    cv_file.close();
+    return 0;
+}
+// Parse the command line into an Args structure. No filtering is performed
+// here; the function always returns 0 once parsing has succeeded.
+int FiltetCandidateVariant(int argc, char* argv[]){
+    Args args;
+    TclapParser(args, argc, argv);
+    return 0;
+}
+// Program entry point: the exit status is the result of FilterComplexVariant.
+int main(int argc, char* argv[]){
+    return FilterComplexVariant(argc, argv);
+}
\ No newline at end of file
diff --git a/src/filter_hc.cpp b/src/filter_hc.cpp
new file mode 100644
index 0000000..73016d3
--- /dev/null
+++ b/src/filter_hc.cpp
@@ -0,0 +1,158 @@
+#include <tclap/CmdLine.h>
+#include <map>
+#include <unordered_map>
+#include <iostream>
+#include "util.h"
+using namespace std;
+// Parsed command-line arguments of filter_hc.
+typedef struct Args {
+    string bed_filename;            // value of -b/--bedfile
+    vector<string> vcf_filenames;   // values of -v/--vcf_files
+    bool keep_outside;              // true when -o/--outside is given
+} Args;
+// Parse the command line into `args` with TCLAP.
+// -b/--bedfile and -v/--vcf_files are declared as required; -o/--outside is
+// a switch that defaults to false.
+// On a TCLAP::ArgException the error is written to stderr and the process
+// is aborted; otherwise the function returns true.
+bool TclapParser(Args & args, int argc, char** argv){
+    const string version = "0.9";
+    try {
+        std::string desc = "Please cite our paper if you are using this program in your research. \n";
+        TCLAP::CmdLine cmd(desc, ' ', version);
+        TCLAP::ValueArg<std::string> arg_bed_filename("b", "bedfile", "bedfile", true, "", "file");
+        TCLAP::MultiArg<std::string> arg_vcf_filenames("v", "vcf_files", "VCF file list", true, "file list");
+        string keep_variant_outside_string = "Keep variants outside, default keep variants inside. \n";
+        // the switch registers itself with cmd through its constructor
+        TCLAP::SwitchArg arg_keep_outside("o", "outside", keep_variant_outside_string, cmd, false);
+        cmd.add(arg_vcf_filenames);
+        cmd.add(arg_bed_filename);
+        cmd.parse(argc, argv);
+        args.bed_filename = arg_bed_filename.getValue();
+        args.vcf_filenames = arg_vcf_filenames.getValue();
+        args.keep_outside = arg_keep_outside.getValue();
+    }
+    catch (TCLAP::ArgException &e)
+    {
+        std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
+        abort();
+    }
+    return true;
+}
+// Load the regions of a BED file.
+//
+// Each chromosome name (column 0) gets an index in `chrname_2_index`, and
+// every region is stored in chr_end_start[index] as end -> start, so a later
+// ordered lookup by end coordinate finds the region that may contain a
+// position. Lines that are empty, start with '#', or have fewer than three
+// tab-separated columns are skipped. The final line is read even when the
+// file does not end with a newline.
+// Prints an error and returns without changes if the file cannot be opened.
+void ReadBedfile(string bed_filename,
+    map<string, int> & chrname_2_index,
+    vector<map<int, int>> & chr_end_start){
+    ifstream input(bed_filename);
+    if(!input.good()){
+        cout << "[Error] Read bed file error" << endl;
+        return;
+    }
+    string line;
+    while (std::getline(input, line))
+    {
+        if (line.empty() || line[0] == '#') continue;
+        vector<string> columns = split(line, '\t');
+        if (columns.size() < 3) continue;
+        const string & chr_name = columns[0];
+        if (chrname_2_index.find(chr_name) == chrname_2_index.end()) {
+            // the new chromosome takes the next free slot of chr_end_start
+            chrname_2_index[chr_name] = (int)chr_end_start.size();
+            chr_end_start.push_back(map<int, int>());
+        }
+        int chr_index = chrname_2_index[chr_name];
+        int startp = stoi(columns[1]);
+        int endp = stoi(columns[2]);
+        chr_end_start[chr_index][endp] = startp;
+    }
+    cout << "finish reading bed file" << endl;
+    return;
+}
+// Filter one VCF file against the loaded BED regions and write the result to
+// "<vcf_filename>.lcr.vcf".
+//
+// Header lines, empty lines, records with fewer than two columns and records
+// on chromosomes that are absent from the BED file are always copied. For
+// the remaining records the 0-based position is tested against the regions,
+// which are treated as half-open [start, end):
+//   keep_outside == false -> keep records inside a region;
+//   keep_outside == true  -> keep records outside every region.
+//
+// The lookup uses upper_bound on the end coordinate, i.e. the first region
+// whose end is strictly greater than the position, so a variant located at
+// the start of a region adjacent to the previous one is found, and a
+// position beyond the last region is treated as outside.
+// NOTE(review): regions that overlap or share an end coordinate are not
+// resolved by this single lookup; confirm the BED input is merged.
+void FilterVcfFile(string vcf_filename,
+    map<string, int> & chrname_2_index,
+    vector<map<int, int>> & chr_end_start,
+    bool keep_outside){
+    string filter_filename = vcf_filename + ".lcr.vcf";
+    ifstream input(vcf_filename);
+    if(!input.good()){
+        cout << "[Error] Read vcf file " + vcf_filename + " error" << endl;
+        return;
+    }
+    vector<string> output_lines;
+    string line;
+    while (std::getline(input, line))
+    {
+        if (line.empty() || line[0] == '#') {
+            output_lines.push_back(line);
+            continue;
+        }
+        vector<string> columns = split(line, '\t');
+        if (columns.size() < 2) {
+            // cannot be positioned; pass through like an unknown chromosome
+            output_lines.push_back(line);
+            continue;
+        }
+        auto chr_it = chrname_2_index.find(columns[0]);
+        if (chr_it == chrname_2_index.end()) {
+            output_lines.push_back(line);
+            continue;
+        }
+        int varp = stoi(columns[1]) - 1;
+        const map<int, int> & end_start = chr_end_start[chr_it->second];
+        // first region with end > varp; it contains varp iff start <= varp
+        auto region = end_start.upper_bound(varp);
+        bool inside = region != end_start.end() && varp >= region->second;
+        if (inside != keep_outside) {
+            output_lines.push_back(line);
+        }
+    }
+    ofstream filter_file;
+    filter_file.open(filter_filename);
+    for (const auto & kept : output_lines) {
+        filter_file << kept << endl;
+    }
+    filter_file.close();
+}
+// Program entry point: parse the arguments, load the BED regions once, then
+// filter every VCF file given on the command line.
+int main(int argc, char* argv[]){
+    Args args;
+    TclapParser(args, argc, argv);
+    vector<map<int, int>> chr_end_start;
+    map<string, int> chrname_2_index;
+    ReadBedfile(args.bed_filename, chrname_2_index, chr_end_start);
+    for (const auto & vcf_filename : args.vcf_filenames) {
+        FilterVcfFile(vcf_filename, chrname_2_index, chr_end_start, args.keep_outside);
+    }
+    return 0;
+}
diff --git a/src/makefile b/src/makefile
new file mode 100644
index 0000000..3b23ec7
--- /dev/null
+++ b/src/makefile
@@ -0,0 +1,20 @@
+# Build rules for the VarMatch C++ tools.
+CXXFLAGS=-std=c++11 -pthread -g
+CXXFLAGS2=-I ../include
+.PHONY: all clean
+all: vm-core
+# Main binary; a copy is placed in the parent directory.
+vm-core: vm.cpp wholegenome.cpp util.cpp
+	$(CXX) $(CXXFLAGS) $(CXXFLAGS2) -o $@ $^
+	cp $@ ../$@
+filter_hc: filter_hc.cpp util.cpp
+	$(CXX) $(CXXFLAGS) $(CXXFLAGS2) -o $@ $^
+filter_cv: filter_cv.cpp util.cpp
+	$(CXX) $(CXXFLAGS) $(CXXFLAGS2) -o $@ $^
+# Removal commands belong to their own target, not to the filter_cv recipe.
+clean:
+	rm -f vm-core
+	rm -f *.o
diff --git a/src/removeduplicate.cpp b/src/removeduplicate.cpp
new file mode 100644
index 0000000..fdf9c4b
--- /dev/null
+++ b/src/removeduplicate.cpp
@@ -0,0 +1,456 @@
+#include "removeduplicate.h"
+RemoveDuplicate::RemoveDuplicate(int thread_num_):VCF(thread_num_){} // forwards the thread count to the VCF base class
+// Return the index of the first entry of pos_boundries that is greater than
+// `pos`, or -1 when no boundary is greater (previously the function fell off
+// its end without returning a value in that case).
+int RemoveDuplicate::GetThreadIndex(int pos){
+    for(int i = 0; i < (int)pos_boundries.size(); i++){
+        if(pos < pos_boundries[i]){
+            return i;
+        }
+    }
+    return -1;
+}
+// Read a VCF file, dropping exact duplicate records.
+//
+// A record is identified by the upper-cased key "<0-based pos>_<ref>_<alt>".
+// The first record seen for a key is stored in nondup_vcfentry_hash (raw
+// line) and nondup_pos_snp_map (SNP typed 'S', 'I' or 'D' by comparing the
+// REF and ALT lengths); later records with the same key are only counted.
+// Records with several comma-separated ALT alleles are skipped, as are
+// records with fewer than five columns.
+//
+// Returns the number of single-ALT records read (duplicates included), or
+// -1 if the genome has not been read yet or the file cannot be opened.
+int RemoveDuplicate::ReadVCFWithoutDup(string filename){
+    if(!boundries_decided){
+        cout << "[Error: RemoveDuplicate] ReadVCFWithoutDup can not read vcf file before read genome file" << endl;
+        return -1;
+    }
+    ifstream vcf_file(filename.c_str());
+    if (!vcf_file.good()) {
+        cout << "[Error] RemoveDuplicate::ReadVCFWithoutDup can not open vcf file" << endl;
+        return -1;
+    }
+    int var_num = 0;
+    string line;
+    while (getline(vcf_file, line)) {
+        if ((int)line.length() <= 1) continue;
+        if (line[0] == '#') continue;
+        auto columns = split(line, '\t');
+        // CHROM, POS, ID, REF and ALT are all needed below
+        if (columns.size() < 5) continue;
+        if(chromosome_name == ".") chromosome_name = columns[0];
+        int pos = atoi(columns[1].c_str()) - 1;
+        const string ref = columns[3];
+        const string alt = columns[4];
+        // multi-allelic records are not handled
+        if (alt.find(",") != string::npos) continue;
+        char snp_type = 'S';
+        if (ref.length() > alt.length()) {
+            snp_type = 'D';
+        } else if (ref.length() < alt.length()) {
+            snp_type = 'I';
+        }
+        var_num++;
+        string varid = to_string(pos) + "_" + ref + "_" + alt;
+        transform(varid.begin(), varid.end(), varid.begin(), ::toupper);
+        if (nondup_vcfentry_hash.find(varid) == nondup_vcfentry_hash.end()) {
+            nondup_vcfentry_hash[varid] = line;
+            nondup_pos_snp_map[pos].push_back(SNP(pos, snp_type, ref, alt));
+        }
+    }
+    vcf_file.close();
+    return var_num;
+}
+// Partition the de-duplicated SNPs into clusters of neighbouring variants.
+// All SNPs of nondup_pos_snp_map are appended to data_list and sorted; the
+// sorted list is then walked once and each SNP is appended to
+// cluster_snps_map[cluster_index]. A new cluster is opened when the stretch
+// of reference between the current cluster and the next SNP is long enough
+// relative to the inserted/deleted bases accumulated in the cluster.
+void RemoveDuplicate::ClusteringSnps() {
+    //int num = 0;
+    //dout << nondup_pos_snp_map.size() << endl;
+    for (auto it = nondup_pos_snp_map.begin(); it != nondup_pos_snp_map.end(); ++it) {
+        auto & v = it->second;
+        for (int k = 0; k < v.size(); k++) {
+            data_list.push_back(v[k]);
+        }
+    }
+    if (data_list.size() == 0)
+        return;
+    sort(data_list.begin(), data_list.end());
+    int cluster_index = 0;
+    int ins_total = 0;
+    int del_total = 0;
+    // c_start: furthest reference end (pos + ref length) seen in the current cluster
+    int c_start = 0;
+    // c_end: position of the SNP being examined
+    int c_end = 0;
+    for (int i = 0; i < data_list.size(); i++) {
+        auto snp = data_list[i];
+        // check if need to separator clusters
+        if (i > 0) {
+            c_end = snp.pos;
+            // only gaps of at least two reference bases can separate clusters
+            if(c_end-c_start >= 2){
+                string separator = genome_sequence.substr(c_start, c_end - c_start);
+                int max_change = max(ins_total, del_total);
+                // split when the gap exceeds twice the largest accumulated indel
+                // length and is either longer than MAX_REPEAT_LEN or rejected by
+                // CheckTandemRepeat
+                if ((int)separator.length() > 2 * max_change &&
+                    ((int)separator.length() > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, max_change))) 
+                {
+                    cluster_index++;
+                    ins_total = 0;
+                    del_total = 0;
+                    c_start = 0; // re-assign c_start
+                }
+            }
+        }
+        // extend the cluster's reference end to cover this SNP
+        if(c_start < snp.pos + (int)snp.ref.length()) c_start = snp.pos + (int)snp.ref.length();
+        // assign snp to cluster
+        cluster_snps_map[cluster_index].push_back(snp);
+        int ref_length = (int)snp.ref.length();
+        int alt_length = (int)snp.alt.length();
+        int diff_length = abs(ref_length - alt_length);
+        if (snp.snp_type == 'I') {
+            ins_total += diff_length;
+        }
+        else if (snp.snp_type == 'D') {
+            del_total += diff_length;
+        }
+    }
+// Repeatedly try to split each cluster of `snp_clusters` in two.
+// The list is processed as a queue: every pass pops each cluster from the
+// front and pushes either the unchanged cluster or its two halves to the
+// back. potential_list runs parallel to snp_clusters and marks the clusters
+// that may still be splittable; passes repeat until none produces a split.
+void RemoveDuplicate::DivisiveHierarchicalClustering(list<vector<SNP> > & snp_clusters){
+    // 
+    if(snp_clusters.size() == 0) return;
+    bool flag = true;
+    list<bool> potential_list;
+    for(int i = 0; i < snp_clusters.size(); i++){
+        potential_list.push_back(true);
+    }
+    while(flag){
+        // flag is set again only if some cluster is split during this pass
+        flag = false;
+        int list_size = snp_clusters.size();
+        for(int i = 0; i < list_size; i++){
+            auto front_cluster = snp_clusters.front();
+            auto front_posential = potential_list.front();
+            snp_clusters.pop_front();
+            potential_list.pop_front();
+            // clusters already marked unsplittable are rotated to the back untouched
+            if(! front_posential){
+                snp_clusters.push_back(front_cluster);
+                potential_list.push_back(front_posential);
+                continue;
+            }
+            int max_start = -1;
+            int max_end = -1;
+            int max_length = -1;
+            int start = front_cluster[0].pos + (int)front_cluster[0].ref.length();
+            // find the largest gap, see if we can separate from that gap
+            // NOTE(review): `start` is never advanced inside this loop, so every
+            // distance is measured from the end of the first SNP rather than from
+            // the previous SNP - confirm whether that is intended.
+            for(int k = 0; k < front_cluster.size(); k++){
+                auto snp = front_cluster[k];
+                auto snp_pos = snp.pos;
+                if(max_length < snp_pos - start){
+                    max_length = snp_pos - start;
+                    max_start = start;
+                    max_end = snp_pos;
+                }
+            }
+            if(max_length <= 0){
+                snp_clusters.push_back(front_cluster);
+                potential_list.push_back(false);
+                continue;
+            }
+            int left_ins = 0;
+            int left_del = 0;
+            int right_ins = 0;
+            int right_del = 0;
+            vector<SNP> left_snp_list;
+            vector<SNP> right_snp_list;
+            string separator = genome_sequence.substr(max_start, max_end-max_start);
+            // distribute the SNPs to the left/right of the gap and total their indel lengths
+            for(int k = 0; k < front_cluster.size(); k++){
+                auto snp = front_cluster[k];
+                int snp_diff = abs((int)snp.ref.length() - (int)snp.alt.length());
+                if(snp.pos <= max_start){
+                    if(snp.snp_type == 'I'){
+                        left_ins += snp_diff;
+                    }else if(snp.snp_type == 'D'){
+                        left_del += snp_diff;
+                    }
+                    left_snp_list.push_back(snp);
+                }else{
+                    if(snp.snp_type == 'I'){
+                        right_ins += snp_diff;
+                    }else if(snp.snp_type == 'D'){
+                        right_del += snp_diff;
+                    }
+                    right_snp_list.push_back(snp);
+                }
+            }
+            //check
+            if(left_snp_list.size() == 0 || right_snp_list.size() == 0){
+                snp_clusters.push_back(front_cluster);
+                potential_list.push_back(false);
+                continue;
+            }
+            vector<int> change_list = {left_ins, left_del, right_ins, right_del};
+            int max_change = 0;
+            for(int k = 0; k < change_list.size(); k++){
+                if (max_change < change_list[k]) max_change = change_list[k];
+            }
+            // same separation criterion as in ClusteringSnps: the gap must exceed
+            // twice the largest indel total and either be longer than
+            // MAX_REPEAT_LEN or be rejected by CheckTandemRepeat
+            if ((int)separator.length() > 2 * max_change &&
+                    ((int)separator.length() > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, max_change)))
+            {
+                flag = true;
+                snp_clusters.push_back(left_snp_list);
+                potential_list.push_back(true);
+                snp_clusters.push_back(right_snp_list);
+                potential_list.push_back(true);
+            }else{
+                snp_clusters.push_back(front_cluster);
+                potential_list.push_back(false);
+                continue;
+            }
+        }
+    }
+    return;
+// Search `snp_list` for two different SNP combinations that turn
+// `subsequence` into the same donor sequence.
+// Combinations of increasing size are applied to `subsequence` (which starts
+// at genome offset `offset`); each resulting sequence is remembered in
+// donor_snps. When a sequence is produced a second time, one record
+// describing the equivalence is appended to
+// complex_match_records[thread_index], the SNPs of both combinations are
+// erased from `snp_list`, and true is returned. Returns false when no such
+// pair exists or the list has at most one SNP.
+bool RemoveDuplicate::FindOneMatch(vector<SNP> & snp_list, 
+    const string subsequence,
+    int offset,
+    int thread_index)
+    if(snp_list.size() <= 1) return false;
+    // donor sequence -> first combination that produced it
+    unordered_map<string, vector<SNP>> donor_snps;
+    for(int i = 1; i < snp_list.size(); i++){
+        vector<vector<SNP> > combinations = CreateCombinations(snp_list, i);
+        for(int k = 0; k < combinations.size(); k++){
+            vector<SNP> comb = combinations[k];
+            if(CheckVariantOverlap(comb)) continue;
+            string alt_sequence = ModifySequenceBySnpList(subsequence, comb, offset);
+            //dout << alt_sequence << endl;
+            if(donor_snps.find(alt_sequence) != donor_snps.end()){
+                string matching_result = "";
+                matching_result += chromosome_name;
+                string parsimonious_ref = subsequence;
+                string parsimonious_alt = alt_sequence;
+                if(parsimonious_ref == parsimonious_alt){
+                    dout << "[Error:RemoveDuplicate::FindOneMatch] in variant, ref == alt";
+                }
+                int min_parsimonious_len = min(parsimonious_ref.size(), parsimonious_alt.size());
+                int chop_left = 0;
+                int chop_right = 0;
+                // count the common prefix (case-insensitive)
+                for(int i = 0; i < min_parsimonious_len; i++){
+                    if(toupper(parsimonious_ref[i]) == toupper(parsimonious_alt[i])){
+                        chop_left ++;
+                    }else{
+                        break;
+                    }
+                }
+                // NOTE(review): this loop reads both strings at the same index i,
+                // counting down from min_parsimonious_len-1; when the two strings
+                // differ in length that is not a comparison of their suffixes -
+                // confirm the intended trimming.
+                for(int i = min_parsimonious_len-1; i >= 0; i--){
+                    if(toupper(parsimonious_ref[i]) == toupper(parsimonious_alt[i])){
+                        chop_right ++;
+                    }else{
+                        break;
+                    }
+                }
+                // 1-based
+                // keep one leading base when trimming would leave ref or alt empty
+                // NOTE(review): chop_left is decremented without checking that it
+                // is positive - verify it cannot become -1 here.
+                if ((int)parsimonious_ref.length() - chop_left - chop_right == 0 || (int)parsimonious_alt.length() - chop_left - chop_right == 0)
+                    chop_left --;
+                matching_result += "\t" + to_string(chop_left + offset + 1);
+                parsimonious_ref = parsimonious_ref.substr(chop_left, (int)parsimonious_ref.length() - chop_left - chop_right);
+                parsimonious_alt = parsimonious_alt.substr(chop_left, (int)parsimonious_alt.length() - chop_left - chop_right);
+                matching_result += "\t" + parsimonious_ref + "\t" + parsimonious_alt;
+                string set_matching_string = "";
+                // first set: the current combination; its SNPs are removed from snp_list
+                for(int m = 0; m < comb.size(); m++){
+                    auto m_snp = comb[m];
+                    for(auto it = snp_list.begin(); it != snp_list.end(); ++it){
+                        auto del_snp = *it;
+                        if(m_snp.snp_type == del_snp.snp_type && m_snp.pos == del_snp.pos && m_snp.ref == del_snp.ref && m_snp.alt == del_snp.alt){
+                            snp_list.erase(it);
+                            break;
+                        }
+                    }
+                    set_matching_string += to_string(m_snp.pos+1) + "," + m_snp.ref + "," + m_snp.alt + ";";
+                }
+                matching_result += "\t"+set_matching_string;
+                set_matching_string = "";
+                // second set: the combination stored earlier for the same donor sequence
+                for(int m = 0; m < donor_snps[alt_sequence].size(); m++){
+                    auto m_snp = donor_snps[alt_sequence][m];
+                    for(auto it = snp_list.begin(); it != snp_list.end(); ++it){
+                        auto del_snp = *it;
+                        if(m_snp.snp_type == del_snp.snp_type && m_snp.pos == del_snp.pos && m_snp.ref == del_snp.ref && m_snp.alt == del_snp.alt){
+                            snp_list.erase(it);
+                            break;
+                        }
+                    }
+                    set_matching_string += to_string(m_snp.pos+1) + "," + m_snp.ref + "," + m_snp.alt + ";";
+                }
+                matching_result += "\t"+set_matching_string + "\n";
+                complex_match_records[thread_index]->push_back(matching_result);
+                return true;
+            }else{
+                donor_snps[alt_sequence] = comb;
+            }
+        }
+    }
+    return false;
+// Find every set of equivalent SNP combinations within one cluster.
+// The reference window spans from one base before the first SNP to one base
+// after the furthest reference end, clamped to the genome; FindOneMatch is
+// then called repeatedly (it removes matched SNPs from the local copy of
+// the list) until it finds nothing or fewer than two SNPs remain.
+void RemoveDuplicate::FindMatches(vector<SNP> snp_list, int thread_index){
+    if(snp_list.size() <= 1) return;
+    sort(snp_list.begin(), snp_list.end());
+    int min_pos = snp_list[0].pos;
+    int max_pos = 0;
+    for (const auto & snp : snp_list) {
+        max_pos = max(max_pos, snp.pos + (int)snp.ref.length());
+    }
+    min_pos = max(0, min_pos - 1);
+    max_pos = min(max_pos + 1, (int)genome_sequence.length());
+    // nothing to compare when the window is empty or lies past the genome end
+    if (min_pos >= max_pos) return;
+    string subsequence = genome_sequence.substr(min_pos, max_pos-min_pos);
+    while(snp_list.size() > 1 && FindOneMatch(snp_list, subsequence, min_pos, thread_index)) {
+        // each successful call removes the matched SNPs; keep searching
+    }
+}
+// Worker routine: search the clusters with ids in [start, end) for
+// equivalent SNP combinations, recording results under `thread_index`.
+// Clusters with more than 20 SNPs are first split by
+// DivisiveHierarchicalClustering and each part is searched separately.
+void RemoveDuplicate::ClusteringRemoveDuplicateInThread(int start, int end, int thread_index){
+    for (int cluster_id = start; cluster_id < end; cluster_id++) {
+        auto found = cluster_snps_map.find(cluster_id);
+        if (found == cluster_snps_map.end()) continue;
+        auto & snp_list = found->second;
+        if(snp_list.size() <= 1) continue;
+        if(snp_list.size() > 20){
+            list<vector<SNP> > snp_clusters = {snp_list};
+            DivisiveHierarchicalClustering(snp_clusters);
+            for (auto & sub_cluster : snp_clusters) {
+                FindMatches(sub_cluster, thread_index);
+            }
+        }else{
+            FindMatches(snp_list, thread_index);
+        }
+    }
+}
+// Distribute the clusters over thread_num workers and write the matches.
+// The cluster id range is cut into thread_num consecutive slices of
+// cluster_step ids; thread_num - 1 slices run in new threads and the last
+// one runs in the calling thread. Each worker appends to its own vector in
+// complex_match_records, so no locking is needed while searching. After all
+// threads are joined, the non-blank records are written to
+// output_complex_filename below a header line.
+// NOTE(review): complex_match_records is allocated here and not released in
+// this function - confirm where it is freed.
+void RemoveDuplicate::ClusteringRemoveDuplicateMultiThread(){
+    // begin() must not be dereferenced on an empty map
+    int start = cluster_snps_map.empty() ? 0 : cluster_snps_map.begin()->first;
+    int cluster_number = cluster_snps_map.size();
+    int cluster_step = cluster_number / thread_num;
+    if (cluster_step * thread_num < cluster_number) cluster_step++;
+    int end = start + cluster_step;
+    //initialize vector size, each allocating will have a lock
+    complex_match_records = new vector<string>* [thread_num];
+    for(int j = 0; j < thread_num; j++){
+        complex_match_records[j] = new vector<string>;
+    }
+    vector<thread> threads;
+    //spawn threads
+    unsigned i = 0;
+    for (; i < thread_num - 1; i++) {
+        threads.push_back(thread(&RemoveDuplicate::ClusteringRemoveDuplicateInThread, this, start, end, i));
+        start = end;
+        end = start + cluster_step;
+    }
+    // also you need to do a job in main thread
+    // i equals to (thread_num - 1)
+    if (i != thread_num - 1) {
+        dout << "[Error] thread number not match" << endl;
+    }
+    if (start >= cluster_snps_map.size()) {
+        dout << "[Error] index out of map range" << endl;
+    }
+    else {
+        ClusteringRemoveDuplicateInThread(start, end, i);
+    }
+    // wait for every worker before reading their results
+    std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
+    ofstream output_complex_file;
+    output_complex_file.open(output_complex_filename);
+    output_complex_file << "#CHR\tPOS\tREF\tALT\tSet1\tSet2" << endl;
+    for(int t = 0; t < thread_num; t++){
+        for (int j = 0; j < complex_match_records[t]->size(); j++){
+            if(complex_match_records[t]->at(j).find_first_not_of(' ') != std::string::npos){
+                output_complex_file << complex_match_records[t]->at(j);
+            }
+        }
+    }
+    output_complex_file.close();
+}
+// Run the whole de-duplication pipeline.
+// Output file names are derived from `output_prefix` (".stat", ".simple",
+// ".complex"). The genome is read first, then the VCF (dropping exact
+// duplicates), the SNPs are clustered, and finally the clusters are searched
+// in parallel for equivalent combinations. Progress messages are written
+// through dsptime()/dout. `direct_search` is not used by this function.
+void RemoveDuplicate::Deduplicate(string vcf_filename,
+            string genome_filename,
+            bool direct_search,
+            string output_prefix)
+{
+    output_stat_filename = output_prefix + ".stat";
+    output_simple_filename = output_prefix + ".simple";
+    output_complex_filename = output_prefix + ".complex";
+    //------------read genome sequence and decide boundary according to thread number
+    dsptime();
+    dout << " Read genome sequence file... " << endl;
+    ReadGenomeSequence(genome_filename);
+    dsptime();
+    dout << " Finish reading genome sequence file." << endl;
+    dsptime();
+    dout << " Read vcf file and remove simple duplications... " << endl;
+    ReadVCFWithoutDup(vcf_filename);
+    dsptime();
+    //-------------clustering search
+    dsptime();
+    dout << " Clustering snps ... " << endl;
+    ClusteringSnps();
+    dsptime();
+    dout << " Finish clustering." << endl;
+    dsptime();
+    dout << " Detect complex duplications..." << endl;
+    ClusteringRemoveDuplicateMultiThread();
+    dsptime();
+    dout << " Output complex duplications..." << endl;
+    return;
+}
diff --git a/src/removeduplicate.h b/src/removeduplicate.h
new file mode 100644
index 0000000..837bc16
--- /dev/null
+++ b/src/removeduplicate.h
@@ -0,0 +1,31 @@
+#pragma once
+#include "vcf.h"
+typedef unordered_map<string, string> VCFEntryHash;
+class RemoveDuplicate: public VCF
+    map<int, vector<SNP> > nondup_pos_snp_map;
+    VCFEntryHash nondup_vcfentry_hash; // id is pos_ref_alt with uppercase
+    int GetThreadIndex(int pos);
+    int ReadVCFWithoutDup(string filename);
+    void ClusteringRemoveDuplicateInThread(int start, int end, int thread_index);
+    void ClusteringRemoveDuplicateMultiThread();
+    void ClusteringSnps() override;
+    void DivisiveHierarchicalClustering(list<vector<SNP>>& snp_clusters);
+    void FindMatches(vector<SNP> snp_list, int thread_index);
+    bool FindOneMatch(vector<SNP> & snp_list, const string subsequence, int offset, int thread_index);
+    RemoveDuplicate(int thread_num_);
+    ~RemoveDuplicate();
+    void Deduplicate(string vcf_filename,
+            string genome_filename,
+            bool direct_search,
+            string output_prefix);
diff --git a/src/splitvcf.cpp b/src/splitvcf.cpp
new file mode 100644
index 0000000..2eed461
--- /dev/null
+++ b/src/splitvcf.cpp
@@ -0,0 +1,30 @@
+//#include "stdafx.h"
+#include "splitvcf.h"
+// Parse the command line with TCLAP and remember the two input file names.
+//   <in.vcf>  (unlabeled, required)  input VCF file
+//   -g / --g  (required)             genome list file
+// On a TCLAP argument error the message is written to stderr and the process aborts.
+// NOTE(review): argv is declared `char*` here and in splitvcf.h, while main passes
+// `char* argv[]` and TCLAP's CmdLine::parse expects an argv array; the signature
+// probably has to become `char** argv` in both files together.
+SplitVcf::SplitVcf(int argc, char* argv) {
+	version = "0.9";
+	try {
+		std::string desc = "split vcf file according to chromosome. \n";
+		TCLAP::CmdLine cmd(desc, ' ', version);
+		//TCLAP::ValueArg<std::string> arg_input_vcf_file("i", "i", "input VCF file", true, "", "file", cmd);
+		TCLAP::UnlabeledValueArg<std::string> arg_input_vcf_file("<in.vcf>", "input VCF file", true, "", "file", cmd);
+		TCLAP::ValueArg<std::string> arg_genome_list_file("g", "g", "genome list file", true, "", "file", cmd);
+		cmd.parse(argc, argv);
+		// Keep the parsed values: the argument objects die with this scope, and
+		// without these assignments both members would stay empty.
+		vcf_filename = arg_input_vcf_file.getValue();
+		genome_list_filename = arg_genome_list_file.getValue();
+	}
+	catch (TCLAP::ArgException &e)
+	{
+		std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
+		abort();
+	}
+// Destructor; no statements are visible in this view of the patch.
+SplitVcf::~SplitVcf() {
+// Program entry point: build a SplitVcf from the command line and run the split.
+int main(int argc, char* argv[]) {
+	// Use an automatic object: the original assigned the pointer returned by
+	// `new` to a value-typed variable (which does not compile) and never freed it.
+	SplitVcf sf(argc, argv);
+	sf.Split();
diff --git a/src/splitvcf.h b/src/splitvcf.h
new file mode 100644
index 0000000..47af518
--- /dev/null
+++ b/src/splitvcf.h
@@ -0,0 +1,15 @@
+#include <iostream>
+#include "util.h"
+#include <tclap/CmdLine.h>
+// Command-line helper that splits a VCF file.
+// NOTE(review): braces and access specifiers are not visible in this view of the patch.
+class SplitVcf
+	// Path of the genome list file.
+	std::string genome_list_filename;
+	// Path of the input VCF file.
+	std::string vcf_filename;
+	// Builds the object from the program's command-line arguments.
+	// NOTE(review): argv is a single `char*` here; main() has a `char* argv[]` -- confirm intended type.
+	SplitVcf(int argc, char* argv);
+	~SplitVcf();
+	// Performs the split; meaning of the bool result is not visible here -- TODO confirm.
+	bool Split();
diff --git a/src/test.py b/src/test.py
new file mode 100644
index 0000000..a4c8bcd
--- /dev/null
+++ b/src/test.py
@@ -0,0 +1 @@
+print 'h'
\ No newline at end of file
diff --git a/src/threadguard.cpp b/src/threadguard.cpp
new file mode 100644
index 0000000..e5a512d
--- /dev/null
+++ b/src/threadguard.cpp
@@ -0,0 +1,9 @@
+#include "threadguard.h"
+	// NOTE(review): the enclosing function header is not visible in this view;
+	// presumably this is the body of ThreadGuard::~ThreadGuard() -- confirm.
+	// Join the referenced thread only if it is still joinable.
+	if (t.joinable()) {
+		t.join();
+	}
diff --git a/src/threadguard.h b/src/threadguard.h
new file mode 100644
index 0000000..1235e9d
--- /dev/null
+++ b/src/threadguard.h
@@ -0,0 +1,17 @@
+#pragma once
+#include <thread>
+using namespace std;
+// Holds a reference to a std::thread; the destructor is defined in threadguard.cpp.
+// NOTE(review): braces and access specifiers are not visible in this view of the patch.
+class ThreadGuard
+	// The guarded thread; only a reference is stored, so the thread object must outlive the guard.
+	thread & t;
+	// explicit: prevents implicit conversion from a thread to a guard.
+	explicit ThreadGuard(thread& t_) : t(t_) {
+	}
+	~ThreadGuard();
+	// Copying is disabled (copy constructor and copy assignment are deleted).
+	ThreadGuard(ThreadGuard const &) = delete;
+	ThreadGuard& operator=(ThreadGuard const&) = delete;
diff --git a/src/util.cpp b/src/util.cpp
new file mode 100644
index 0000000..42021d9
--- /dev/null
+++ b/src/util.cpp
@@ -0,0 +1,20 @@
+#include "util.h"
+/*split function*/
+// Split `s` on the single character `delim` and append every non-empty piece
+// to `elems` (existing contents of `elems` are kept). Returns `elems`.
+// Empty pieces are dropped, so consecutive delimiters do not produce empty
+// strings and field positions after an empty field shift left.
+std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) {
+	std::stringstream ss(s);
+	std::string item;
+	while (std::getline(ss, item, delim)) {
+		if (!item.empty()) {
+			elems.push_back(item);
+		}
+	}
+	return elems;
+/*This split function only supports a single char as delimiter; for a string delimiter, please use the boost split function*/
+// Convenience overload: returns a new vector holding the non-empty pieces of `s`.
+std::vector<std::string> split(const std::string &s, char delim) {
+	std::vector<std::string> elems;
+	split(s, delim, elems);
+	return elems;
\ No newline at end of file
diff --git a/src/util.h b/src/util.h
new file mode 100644
index 0000000..4dcb105
--- /dev/null
+++ b/src/util.h
@@ -0,0 +1,54 @@
+//#ifndef UTILITIES_H
+//#define UTILITIES_H
+#pragma once
+// dout is the debug output stream: with DEBUG defined it is an alias for cout.
+// The alternative definition "0 && cout" short-circuits, so a `dout << ...`
+// statement evaluates nothing and prints nothing.
+// NOTE(review): the #else / #endif lines that select between the two
+// definitions are not visible in this view of the patch -- confirm they exist.
+#define DEBUG
+#ifdef DEBUG
+#define dout cout
+#define dout 0 && cout
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <vector>
+#include <string>
+#include <sstream>
+#include <cstring>
+#include <sys/stat.h>
+#include <cassert>
+#include <cstdlib>
+using namespace std;
+/*split function*/
+std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems);
+/*This split function only support char as delim, string as delim please boost split function*/
+std::vector<std::string> split(const std::string &s, char delim);
+// Return true when stat() succeeds on `name`, i.e. the path exists and is accessible.
+inline bool FileExists (const std::string& name) {
+  struct stat buffer;
+  return (stat (name.c_str(), &buffer) == 0);
+// Convert any stream-insertable value to its textual form using operator<<.
+// A named stream is used instead of taking the address of the result of
+// `ostringstream() << number`, which is not portable across standard libraries.
+template <typename T>
+string ToString(const T & number){
+	ostringstream buffer;
+	buffer << number;
+	return buffer.str();
+// Print the current local time to cout as "M/D/YYYY,H:M:S " (no zero padding,
+// no newline). Note that this writes to cout directly, not to dout.
+// NOTE(review): localtime() is not guaranteed to be thread-safe -- confirm
+// dsptime() is only called from one thread at a time.
+inline void dsptime()
+ time_t nowtime;
+ //nowtime = time(NULL); //get int time number
+ time(&nowtime); // get current time
+ struct tm * ptm=localtime(&nowtime);  //convert time to local time
+ // tm_mon is 0-based and tm_year counts from 1900, hence the adjustments.
+ cout << ptm->tm_mon+1 << "/" << ptm->tm_mday << "/"<< ptm->tm_year+1900 << "," ;
+ cout << ptm->tm_hour << ":" << ptm->tm_min << ":" << ptm->tm_sec <<" ";
diff --git a/src/vcf.cpp b/src/vcf.cpp
new file mode 100644
index 0000000..a510bcd
--- /dev/null
+++ b/src/vcf.cpp
@@ -0,0 +1,1230 @@
+#include "vcf.h"
+// Order SNPs by position only; SNPs at the same position compare as equivalent.
+bool operator <(const SNP& x, const SNP& y) {
+	return x.pos < y.pos;
+// Two SNPs are equal when position, type, alt allele and genotype all match.
+// The ref allele is not part of the comparison.
+bool operator ==(const SNP& x, const SNP& y) {
+	if (x.pos == y.pos && x.snp_type == y.snp_type && x.alt == y.alt && x.genotype == y.genotype) {
+		return true;
+	}
+	return false;
+// Construct a VCF processor.
+// thread_num_: requested number of worker threads. Non-positive requests fall
+// back to 1; otherwise the request is capped at the hardware concurrency when
+// that value is known. thread_num is therefore always >= 1.
+VCF::VCF(int thread_num_)
+    debug_f = 0;
+	genome_sequence = "";
+	boundries_decided = false;
+	clustering_search = false;
+	match_genotype = true;
+	// hardware_concurrency() returns 0 when the value cannot be determined.
+	// Capping with min() in that case would set thread_num to 0, and
+	// DecideBoundaries() divides by thread_num.
+	int hardware_threads = (int)thread::hardware_concurrency();
+	if (thread_num_ <= 0) {
+		thread_num = 1;
+	}
+	else if (hardware_threads <= 0) {
+		thread_num = thread_num_;
+	}
+	else {
+		thread_num = min(thread_num_, hardware_threads);
+	}
+	dout << "VCF() Thread Number: " << thread_num << endl;
+    // "." means "not known yet"; ReadVCF replaces it with the first CHROM value it sees.
+    chromosome_name = ".";
+// protected
+// Reduce a ref/alt pair to a parsimonious representation.
+//   pos  0-based genome position of ref[0].
+//   ref, alt  alleles as read from the VCF.
+//   parsimonious_ref / parsimonious_alt  outputs; always initialised to ref/alt
+//     so callers that ignore the return value still get usable strings.
+// Shared trailing bases are removed first (extending to the left with the
+// genome base whenever an allele would become empty), then shared leading
+// bases are removed while both alleles keep at least one base.
+// Returns false when normalization is not possible (no genome loaded, pos
+// outside the genome, ref not matching the genome, empty alt, or alleles that
+// cannot be trimmed further at the start of the genome).
+// NOTE(review): pos is passed by value, so a left extension or a leading trim
+// changes the true start of the variant without the caller being told.
+bool VCF::NormalizeSnp(int pos, string ref, string alt, string & parsimonious_ref, string & parsimonious_alt) {
+	parsimonious_ref = ref;
+	parsimonious_alt = alt;
+	int left_index = pos;
+	if (genome_sequence.size() == 0) return false;
+	// Guard the genome lookup below against positions outside the sequence.
+	if (pos < 0 || pos >= (int)genome_sequence.size()) return false;
+	//if (parsimonious_ref.size() == 1 || parsimonious_alt.size() == 1) return true;
+	if (toupper(genome_sequence[left_index]) != toupper(parsimonious_ref[0])) {
+		dout << "[Error] genome sequence, subsequence, offset does not match." << endl;
+		return false;
+	}
+	// An empty alt (ReadVCF maps "." to "") has no last base; back() on it is undefined.
+	if (parsimonious_alt.empty()) return false;
+	bool change_in_allels = true;
+	while (change_in_allels) {
+		change_in_allels = false;
+		if (toupper(parsimonious_ref.back()) == toupper(parsimonious_alt.back())) {
+			// Trimming is allowed if both alleles stay non-empty, or if there is
+			// genome to the left that can refill an emptied allele.
+			if((parsimonious_ref.size() > 1 && parsimonious_alt.size() > 1) || left_index > 0){
+                parsimonious_ref.pop_back();
+				parsimonious_alt.pop_back();
+				change_in_allels = true;
+			}
+			else {
+				return false;
+			}
+		}
+		if (parsimonious_ref.length() == 0 || parsimonious_alt.length() == 0) {
+			// Extend both alleles one base to the left so neither is empty.
+			left_index--;
+			char left_char = genome_sequence[left_index];
+			parsimonious_ref = left_char + parsimonious_ref;
+			parsimonious_alt = left_char + parsimonious_alt;
+		}
+	}
+	while (toupper(parsimonious_ref[0]) == toupper(parsimonious_alt[0]) && parsimonious_ref.size() > 1 && parsimonious_alt.size() > 1) {
+		parsimonious_ref.erase(0, 1);
+		parsimonious_alt.erase(0, 1);
+	}
+    //if(parsimonious_ref != ref){
+    //    cout << ref << "," << alt << "," << parsimonious_ref << "," << parsimonious_alt << endl;
+    //}
+	return true;
+// private
+// Parse a VCF file and store its variants in pos_2_snp.
+//   filename   path of the VCF file.
+//   pos_2_snp  one position->SNP-list map per thread; each variant is stored in
+//              the map of the first thread whose boundary lies beyond its position.
+// Requires that the genome has been read first (boundries_decided).
+// Positions are converted from the 1-based VCF POS to 0-based.
+// Without genotype matching every ALT allele becomes one SNP. With genotype
+// matching only the alleles named by the GT field are stored, one SNP per
+// distinct non-reference allele; "0|0" records are skipped.
+// Malformed records (too few columns, missing GT, unparsable genotype, allele
+// index beyond the ALT list) are reported and skipped instead of being indexed
+// out of bounds.
+void VCF::ReadVCF(string filename, SnpHash & pos_2_snp) {
+	if (!boundries_decided) {
+		dout << "[Error] VCF::ReadVCF cannot read vcf file before read genome file" << endl;
+		return;
+	}
+	ifstream vcf_file;
+	vcf_file.open(filename.c_str());
+	if (!vcf_file.good()) {
+		cout << "[VarMatch] Error: can not open vcf file" << endl;
+		return;
+	}
+	if (normalization) {
+		dout << "normalize while read" << endl;
+	}
+	// 'S' = same length, 'D' = ref longer than alt, 'I' = alt longer than ref.
+	auto classify = [](const string & r, const string & a) -> char {
+		if (r.length() > a.length()) return 'D';
+		if (r.length() < a.length()) return 'I';
+		return 'S';
+	};
+	string line;
+	while (getline(vcf_file, line, '\n')) {
+		// check ineligible lines
+		if ((int)line.length() <= 1) continue;
+		if (line.find_first_not_of(' ') == std::string::npos) continue;
+		if (line[0] == '#'){
+			if(line[1] == '#') continue;
+			// Column header line: fewer than 10 columns means there is no sample column.
+			auto head_names = split(line, '\t');
+			if(match_genotype && head_names.size() < 10){
+				cout << "[VarMatch] Warning: not enough information in VCF file for genotype matching." << endl;
+				cout << "[VarMatch] \tVCF file name " << filename << endl;
+				cout << "[VarMatch] \tAutomatically turn off genotype matching module." << endl;
+				match_genotype = false;
+			}
+			continue;
+		}
+		auto columns = split(line, '\t');
+		if(match_genotype && columns.size() < 10){
+			cout << "[VarMatch] Warning: not enough information in VCF file for genotype matching." << endl;
+			cout << "[VarMatch] \tskip current variant " << filename << endl;
+			continue;
+		}
+		// CHROM, POS, ID, REF and ALT are read below, so at least 5 columns are needed.
+		if(columns.size() < 5){
+			cout << "[VarMatch] Warning: malformed VCF line with fewer than 5 columns, skip current variant." << endl;
+			continue;
+		}
+		if(chromosome_name == ".") chromosome_name = columns[0];
+		int pos = atoi(columns[1].c_str()) - 1;
+		string ref = columns[3];
+		string alt_line = columns[4];
+		if (ref == ".") ref = "";
+		if (alt_line == ".") alt_line = "";
+		//decide which thread to use
+		// NOTE(review): a position beyond the last boundary falls back to thread 0.
+		int thread_index = 0;
+		for (int i = 0; i < (int)pos_boundries.size(); i++) {
+			if (pos < pos_boundries[i]) {
+				thread_index = i;
+				break;
+			}
+		}
+		string genotype = "1/1";
+		vector<string> genotype_columns;
+		if (match_genotype){
+			int genotype_index = -1;
+			auto formats = split(columns[8], ':');
+			for(int i = 0; i < (int)formats.size(); i++){
+				if(formats[i] == "GT"){
+					genotype_index = i;
+					break;
+				}
+			}
+			auto additionals = split(columns[9], ':');
+			if (genotype_index < 0 || genotype_index >= (int)additionals.size()) {
+				cout << "[VarMatch] Warning: not enough information in VCF file for genotype matching." << endl;
+				cout << "[VarMatch] \tskip current variant " << filename << endl;
+				continue;
+			}
+			genotype = additionals[genotype_index];
+			if(genotype.find("/") != std::string::npos){
+				genotype_columns = split(genotype, '/');
+			}else if(genotype.find("|") != std::string::npos){
+				genotype_columns = split(genotype, '|');
+			}else{
+				cout << "[VarMatch] Error: Unrecognized Genotype: " << genotype << endl;
+				continue;
+			}
+			// normalize format of genotype: sorted, separated by |
+			if(genotype_columns.size() != 2){
+				cout << "[VarMatch] Warning Unrecognized Genotype: " << genotype << endl;
+				// Two alleles are read below; with fewer there is nothing safe to index.
+				if(genotype_columns.size() < 2) continue;
+			}else{
+				sort(genotype_columns.begin(), genotype_columns.end());
+				genotype = genotype_columns[0]+"|"+genotype_columns[1];
+			}
+		}
+		vector<string> alt_list;
+		if (alt_line.find(",") != std::string::npos){
+			alt_list = split(alt_line, ',');
+		}else{
+			alt_list.push_back(alt_line);
+		}
+		// Store one SNP for the given alt allele, normalized when requested.
+		auto add_snp = [&](const string & alt) {
+			char snp_type = classify(ref, alt);
+			if (normalization) {
+				string norm_ref, norm_alt;
+				NormalizeSnp(pos, ref, alt, norm_ref, norm_alt);
+				pos_2_snp[thread_index][pos].push_back(SNP(pos, snp_type, norm_ref, norm_alt, genotype));
+			}
+			else {
+				pos_2_snp[thread_index][pos].push_back(SNP(pos, snp_type, ref, alt, genotype));
+			}
+		};
+		if(!match_genotype){
+			for(auto alt_it = alt_list.begin(); alt_it != alt_list.end(); ++alt_it){
+				add_snp(*alt_it);
+			}
+			continue;
+		}
+		//append variants according to genotype
+		if(genotype == "0|0") continue;
+		if (alt_list.size() > 1) {
+			haplotype_matching_check[thread_index][pos] = 0;
+		}
+		for (int allele = 0; allele < 2; allele++) {
+			// A homozygous call contributes its allele only once.
+			if (allele == 1 && genotype_columns[0] == genotype_columns[1]) break;
+			// GT allele numbers are 1-based into ALT; 0 (and ".") mean reference.
+			int genotype_val = atoi(genotype_columns[allele].c_str()) - 1;
+			if (genotype_val < 0) continue; // reference allele, nothing to add
+			if (genotype_val >= (int)alt_list.size()) {
+				cout << (allele == 0 ? "[VarMatch] Warning: Unrecognized Genotype. "
+				                     : "[VarMatch] Warning: Unrecognized Genotype[2]. ")
+				     << genotype_val << endl;
+				continue; // there is no such ALT allele to store
+			}
+			add_snp(alt_list[genotype_val]);
+		}
+	}
+	vcf_file.close();
+	return;
+// protected
+// Load a FASTA file into genome_sequence and then compute the per-thread
+// position boundaries. Header lines (starting with '>') are skipped and all
+// sequence lines are concatenated, so every record in the file ends up in one
+// string. If the file cannot be opened a message is printed and nothing changes.
+void VCF::ReadGenomeSequence(string filename) {
+	ifstream genome_file;
+	genome_file.open(filename.c_str());
+	if (!genome_file.good()) {
+		cout << "[VarMatch] can not open FASTA file: ";
+		cout << filename << endl;
+		return;
+	}
+	genome_sequence = "";
+	string line;
+	while (getline(genome_file, line, '\n')) {
+		// Strip the carriage return left by CRLF line endings so it does not
+		// become part of the genome.
+		if (!line.empty() && line[line.length() - 1] == '\r') line.erase(line.length() - 1);
+		// Only truly empty lines are skipped: a sequence line holding a single
+		// base is valid and dropping it would shorten the genome.
+		if (line.empty()) continue;
+		if (line[0] == '>') continue;
+		genome_sequence += line;
+	}
+	genome_file.close();
+	// boundries can get after knowing genome sequence.
+	DecideBoundaries();
+	return;
+// protected
+// Split the genome into thread_num contiguous position ranges and create one
+// empty SNP container per thread for the reference and query sides.
+// Sets boundries_decided, which ReadVCF checks before reading.
+// NOTE(review): the vectors are appended to, not reset, so calling this more
+// than once accumulates boundaries and containers; thread_num must be non-zero.
+void VCF::DecideBoundaries() {
+	int genome_size = genome_sequence.size();
+	int distance = genome_size / thread_num;
+	// Upper (exclusive) bound of each of the first thread_num-1 ranges.
+	for (int i = 0; i < thread_num - 1; i++) {
+		pos_boundries.push_back((i + 1)*distance);
+	}
+	// The last range ends at the genome end and absorbs the division remainder.
+	pos_boundries.push_back(genome_size);
+	// initialize two for copy
+	unordered_map<int, vector<SNP> > ref_h;
+	unordered_map<int, vector<SNP> > que_h;
+	map<int, vector<SNP> > ref_m;
+	map<int, vector<SNP> > que_m;
+	for (int i = 0; i < thread_num; i++) {
+		refpos_2_snp.push_back(ref_h);
+		querypos_2_snp.push_back(que_h);
+		refpos_snp_map.push_back(ref_m);
+		querypos_snp_map.push_back(que_m);
+	}
+	boundries_decided = true;
+// private
+// Read the reference VCF file into refpos_2_snp.
+void VCF::ReadRefVCF(string filename) {
+	ReadVCF(filename, this->refpos_2_snp);
+// private
+// Read the query VCF file into querypos_2_snp.
+void VCF::ReadQueryVCF(string filename) {
+	ReadVCF(filename, this->querypos_2_snp);
+// protected
+// Return true when r and q describe the same variant: same position, same
+// ref and alt alleles compared case-insensitively, and (only when
+// match_genotype is on) the same genotype string.
+bool VCF::CompareSnps(SNP r, SNP q) {
+	if(r.pos != q.pos) return false;
+	// directly match genotype
+	if(match_genotype && r.genotype != q.genotype) return false;
+    // Upper-case copies of all four alleles so the comparison ignores case.
+    auto ref_ref = r.ref;
+	transform(ref_ref.begin(), ref_ref.end(), ref_ref.begin(), ::toupper);
+	auto ref_alt = r.alt;
+	transform(ref_alt.begin(), ref_alt.end(), ref_alt.begin(), ::toupper);
+	auto que_ref = q.ref;
+	transform(que_ref.begin(), que_ref.end(), que_ref.begin(), ::toupper);
+	auto que_alt = q.alt;
+	transform(que_alt.begin(), que_alt.end(), que_alt.begin(), ::toupper);
+	if (ref_ref == que_ref && ref_alt == que_alt) return true;
+	return false;
+// Direct (position-by-position) matching for one thread's share of the data.
+//   ref_snps / query_snps  position -> SNP list for the reference and query set.
+//   thread_index           selects the direct_match_records slot to append to.
+// For every reference SNP the first query SNP at the same position that
+// CompareSnps accepts is paired with it: a "CHROM\tPOS\tREF\tALT" record
+// (1-based POS) is appended and both SNPs are removed. Positions whose lists
+// become empty are removed from their maps, so only unmatched SNPs remain.
+void VCF::DirectSearchInThread(unordered_map<int, vector<SNP> > & ref_snps, unordered_map<int, vector<SNP> > & query_snps, int thread_index) {
+	// handle heterozygous variants
+	auto rit = ref_snps.begin();
+	while (rit != ref_snps.end()) {
+		auto & r_snps = rit->second;
+		auto qit = query_snps.find(rit->first);
+		if (qit == query_snps.end()) {
+			++rit;
+			continue;
+		}
+		auto & q_snps = qit->second;
+		for (auto r_snp_it = r_snps.begin(); r_snp_it != r_snps.end(); ) {
+			bool matched_r_snp = false;
+			for (auto q_snp_it = q_snps.begin(); q_snp_it != q_snps.end(); ++q_snp_it) {
+				if (CompareSnps(*r_snp_it, *q_snp_it)) {
+					auto temp_snp = *r_snp_it;
+					string matching_result = chromosome_name + '\t' + to_string(temp_snp.pos+1) + "\t" + temp_snp.ref + "\t" + temp_snp.alt;
+					direct_match_records[thread_index]->push_back(matching_result);
+					matched_r_snp = true;
+					// Safe without re-seating the iterator: the loop is left immediately.
+					q_snps.erase(q_snp_it);
+					break;
+				}
+			}
+			if (matched_r_snp) {
+				// vector::erase invalidates the erased iterator; continue from
+				// the iterator it returns instead of reusing the old one.
+				r_snp_it = r_snps.erase(r_snp_it);
+			}
+			else {
+				++r_snp_it;
+			}
+		}
+		// Decide about the query entry before the reference entry is erased;
+		// the two maps are independent, so the order does not affect the result.
+		if (q_snps.size() == 0) {
+			query_snps.erase(qit);
+		}
+		if (r_snps.size() == 0) {
+			rit = ref_snps.erase(rit);
+		}
+		else {
+			++rit;
+		}
+	}
+// directly match by position
+// private
+// Run DirectSearchInThread on every per-thread partition (thread_num - 1
+// spawned threads plus the calling thread), then write all matches to
+// output_simple_filename with a small header.
+void VCF::DirectSearchMultiThread() {
+    // One result vector per thread so workers never share a container.
+    direct_match_records = new vector<string>* [thread_num];
+    for(int j = 0; j < thread_num; j++){
+        direct_match_records[j] = new vector<string>;
+    }
+	vector<thread> threads;
+	//spawn threads
+	int i = 0;
+	for (; i < thread_num - 1; i++) {
+		threads.push_back( thread(&VCF::DirectSearchInThread, this, ref(refpos_2_snp[i]), ref(querypos_2_snp[i]), i));
+	}
+	// also you need to do a job in main thread
+	// i equals to (thread_num - 1)
+	if (i != thread_num - 1) {
+		dout << "[Error] thread number not match" << endl;
+	}
+	DirectSearchInThread(refpos_2_snp[i], querypos_2_snp[i],i);
+	// call join() on each thread in turn before this function?
+	std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
+    threads.clear();
+    // NOTE(review): the stream is not checked after open(); a failed open silently loses the output.
+    ofstream output_simple_file;
+    output_simple_file.open(output_simple_filename);
+    output_simple_file << "##VCF1:" << ref_vcf_filename << endl;
+    output_simple_file << "##VCF2:" << que_vcf_filename << endl;
+    output_simple_file << "#CHROM\tPOS\tREF\tALT" << endl;
+    // Records are written grouped by thread index, in the order each thread found them.
+    for(int i = 0; i < thread_num; i++){
+        for (int j = 0; j < direct_match_records[i]->size(); j++){
+            output_simple_file << direct_match_records[i]->at(j) << endl;
+        }
+    }
+    output_simple_file.close();
+    // Release the per-thread vectors and the pointer array allocated above.
+    for(int j = 0; j < thread_num; j++){
+        delete direct_match_records[j];
+    }
+    delete [] direct_match_records;
+// protected
+// Apply a single SNP to a sequence.
+//   sequence  genome fragment whose first base is at genome position `offset`.
+//   s         the variant; s.pos is a genome position.
+// Returns an upper-cased copy of `sequence` in which the bases covered by
+// s.ref are replaced by s.alt. If the SNP does not lie entirely inside the
+// fragment, the fragment is returned unchanged (upper-cased), matching what
+// ModifySequenceBySnpList does for out-of-range variants.
+string VCF::ModifySequenceBySnp(const string sequence, SNP s, int offset) {
+	int sequence_length = (int)sequence.length();
+	int snp_pos = s.pos - offset;
+	int snp_end = snp_pos + (int)s.ref.length();
+	string result;
+	if (snp_pos < 0 || snp_end > sequence_length) {
+		// substr() would throw std::out_of_range for such a range, so report
+		// the problem and leave the fragment untouched.
+		dout << "[Error] snp end greater than sequence length" << endl;
+		result = sequence;
+	}
+	else {
+		result = sequence.substr(0, snp_pos) + s.alt + sequence.substr(snp_end);
+	}
+	transform(result.begin(), result.end(), result.begin(), ::toupper);
+	return result;
+// protected
+// Apply a list of SNPs to a sequence.
+//   sequence  genome fragment whose first base is at genome position `offset`.
+//   s         the variants (taken by value because the list is sorted here).
+// Returns an upper-cased copy of `sequence` with every SNP's ref bases replaced
+// by its alt bases. If any SNP does not lie inside the (partially edited)
+// fragment, the original fragment is returned unchanged (upper-cased).
+// A single-element list goes through the same path, so one SNP and many SNPs
+// behave identically on bad input.
+string VCF::ModifySequenceBySnpList(const string sequence, vector<SNP> s, int offset) {
+	string result = sequence;
+	// Apply from the rightmost SNP to the leftmost: an edit changes the length
+	// of everything to its right, so earlier coordinates stay valid this way.
+	sort(s.begin(), s.end());
+	for (int i = (int)s.size() - 1; i >= 0; i--) {
+		int snp_pos = s[i].pos - offset;
+		int snp_end = snp_pos + (int)s[i].ref.length();
+		int result_length = (int)result.length();
+		if (snp_pos < 0 || snp_end > result_length) {
+			// Out-of-range variant: give up and return the unmodified fragment.
+			result = sequence;
+			break;
+		}
+		result = result.substr(0, snp_pos) + s[i].alt + result.substr(snp_end);
+	}
+	transform(result.begin(), result.end(), result.begin(), ::toupper);
+	return result;
+// protected
+// Return true when some SNP in the list starts before the furthest reference
+// end (pos + ref length) of the SNPs that precede it in the list.
+// Lists with at most one element never overlap.
+// NOTE(review): the scan follows list order and does not sort, so it
+// presumably expects position-sorted input -- verify against the callers.
+bool VCF::CheckVariantOverlap(vector<SNP> snp_list){
+    if (snp_list.size() <= 1) return false;
+    // Furthest reference end seen so far; -1 so the first SNP can never overlap.
+    int previous_ends = -1;
+    for(int i = 0; i < snp_list.size(); i++){
+        if(snp_list[i].pos < previous_ends) return true;
+        if( previous_ends < snp_list[i].pos + (int)snp_list[i].ref.length()){
+            previous_ends = snp_list[i].pos + (int)snp_list[i].ref.length();
+        }
+    }
+    return false;
+// Sleeps for two seconds and prints "Hello World".
+// NOTE(review): looks like a leftover threading experiment; no caller is visible here.
+void f(){
+    this_thread::sleep_for(chrono::seconds(2));
+    cout << "Hello World" << endl;
+// protected
+// Return true when `sequence` (compared case-insensitively) starts with a unit
+// of length 1 .. len/2+1 that repeats back-to-back at least twice. Only whole
+// units are compared: a trailing piece shorter than the unit is not checked.
+// A one-character sequence is reported as a repeat.
+// NOTE(review): unit_threshold only feeds repeat_threshold, which is never
+// read, so the parameter currently has no effect on the result.
+bool VCF::CheckTandemRepeat(string sequence, int unit_threshold) {
+	int sequence_length = (int)sequence.length();
+	//cout << sequence_length << "," << unit_threshold << endl;
+    if(sequence_length == 1) return true;
+	transform(sequence.begin(), sequence.end(), sequence.begin(), ::toupper);
+    // Largest unit length that is tried.
+    int end_index = sequence_length / 2 + 1;
+	bool final_checking = false;
+    int repeat_threshold = min(end_index-1, unit_threshold);
+	for (int repeat_length = 1; repeat_length <= end_index; repeat_length++) {
+		bool is_tandem_repeat = true;
+        int repeat_time = 1;
+		string repeat_region = sequence.substr(0, repeat_length);
+		int start_position = repeat_length;
+		while (start_position < sequence_length) {
+			// Stop at an incomplete trailing unit instead of comparing it.
+			if (start_position + repeat_length > sequence_length)
+				break;
+			string matching_region = sequence.substr(start_position, repeat_length);
+			if (matching_region != repeat_region) {
+				is_tandem_repeat = false;
+				break;
+			}
+			start_position += repeat_length;
+            repeat_time ++;
+		}
+		// The unit must occur at least twice to count as a repeat.
+		if (is_tandem_repeat && repeat_time > 1) {
+            final_checking = true;
+			break;
+		}
+    }
+	return final_checking;
+// Clustering SNPs.
+// For a description of the algorithm, please refer to the Methods section of the paper.
+// protected
+// Merge all reference and query SNPs into data_list, sort them by position and
+// assign each to a cluster in cluster_snps_map. Reference SNPs are tagged with
+// flag 1 and query SNPs with flag -1. A new cluster starts when the genome
+// stretch between the furthest reference end seen so far and the next SNP is
+// at least 2 bases long, longer than twice the accumulated indel length, and
+// either longer than MAX_REPEAT_LEN or not a tandem repeat.
+void VCF::ClusteringSnps() {
+    // handle heterozygous snps
+	for (int i = 0; i < refpos_2_snp.size(); i++) {
+		auto & m = refpos_2_snp[i];
+		for (auto it = m.begin(); it != m.end(); ++it) {
+			auto & v = it->second;
+			for (int k = 0; k < v.size(); k++) {
+				if (v[k].flag != 1) {
+					v[k].flag = 1;
+				}
+				data_list.push_back(v[k]);
+			}
+		}
+	}
+	for (int i = 0; i < querypos_2_snp.size(); i++) {
+		auto & m = querypos_2_snp[i];
+		for (auto it = m.begin(); it != m.end(); ++it) {
+			auto & v = it->second;
+			for (int k = 0; k < v.size(); k++) {
+				v[k].flag = -1;
+				data_list.push_back(v[k]);
+			}
+		}
+	}
+	if (data_list.size() == 0)
+		return;
+	sort(data_list.begin(), data_list.end());
+	int cluster_index = 0;
+	// Total inserted / deleted bases in the current cluster, per input set.
+	int ins_ref = 0;
+	int del_ref = 0;
+	int ins_que = 0;
+	int del_que = 0;
+	// c_start: furthest reference end in the current cluster; c_end: start of the current SNP.
+	int c_start = 0;
+    int c_end = 0;
+    for (int i = 0; i < data_list.size(); i++) {
+        auto snp = data_list[i];
+		// check if need to separator clusters
+        if (i > 0) {
+			c_end = snp.pos;
+            if(c_end-c_start >= 2){
+                string separator = genome_sequence.substr(c_start, c_end - c_start);
+                // Largest net length difference the two sets could produce.
+                int max_change = max(ins_ref + del_que, ins_que + del_ref);
+                if ((int)(separator.length()) > 2 * max_change &&
+                    ((int)(separator.length()) > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, max_change)))
+                {
+                    cluster_index++;
+                    ins_ref = 0;
+                    del_ref = 0;
+                    ins_que = 0;
+                    del_que = 0;
+                    c_start = 0; // re-assign c_start
+                }
+            }
+		}
+        if(c_start < snp.pos + (int)(snp.ref.length())) c_start = snp.pos + (int)(snp.ref.length());
+        // assign snp to cluster
+		cluster_snps_map[cluster_index].push_back(snp);
+		int ref_length = (int)(snp.ref.length());
+		int alt_length = (int)(snp.alt.length());
+		int diff_length = abs(ref_length - alt_length);
+		if (snp.flag == 1) {
+			if (snp.snp_type == 'I') {
+				ins_ref += diff_length;
+			}
+			else if (snp.snp_type == 'D') {
+				del_ref += diff_length;
+			}
+		}
+		else {
+			if (snp.snp_type == 'I') {
+				ins_que += diff_length;
+			}
+			else if (snp.snp_type == 'D') {
+				del_que += diff_length;
+			}
+		}
+	}
+// protected
+// Return true when some non-overlapping subset of ref_snp_list and some
+// non-overlapping subset of query_snp_list turn `subsequence` into the same
+// sequence. Subsets are tried from largest to smallest.
+// mixed_list is sorted in place; no list is shrunk and nothing is recorded.
+// NOTE(review): despite the name no weighting is applied; best_match,
+// best_score, best_ref_variants, best_alt_variants and thread_index are unused.
+bool VCF::MatchSnpListsWithWeight(vector<SNP> & ref_snp_list,
+	vector<SNP> & query_snp_list,
+	vector<SNP> & mixed_list,
+	const string subsequence,
+	int offset,
+	int thread_index)
+	// handle heterozygous snps
+	// Resulting sequence -> the reference subset that produces it (a later subset overwrites an earlier one).
+	map<string, vector<SNP> > ref_choice_snps;
+	sort(mixed_list.begin(), mixed_list.end());
+	for (int i = ref_snp_list.size(); i >= 1; i--) {
+		vector<vector<SNP> > combinations = CreateCombinations(ref_snp_list, i);
+		for (int k = 0; k < combinations.size(); k++) {
+			auto c = combinations[k];
+			if (CheckVariantOverlap(c)) continue;
+			string ref_sequence = ModifySequenceBySnpList(subsequence, c, offset);
+			ref_choice_snps[ref_sequence] = c;
+		}
+	}
+	string best_match;
+	int best_score = 0;
+	vector<SNP> best_ref_variants;
+	vector<SNP> best_alt_variants;
+	for (int i = query_snp_list.size(); i >= 1; i--) {
+		vector<vector<SNP> > combinations = CreateCombinations(query_snp_list, i);
+		for (int k = 0; k < combinations.size(); k++) {
+			auto c = combinations[k];
+			if (CheckVariantOverlap(c)) continue;
+			string que_sequence = ModifySequenceBySnpList(subsequence, c, offset);
+			// The first query subset whose sequence a reference subset also produces is a match.
+			if (ref_choice_snps.find(que_sequence) != ref_choice_snps.end()) {
+				return true;
+			}
+		}
+	}
+	return false;
+// protected
+// Find one pair of SNP subsets (one from the reference list, one from the
+// query list) that turn `subsequence` into the same sequence.
+//   ref_snp_list / query_snp_list  candidate SNPs of the two sets.
+//   mixed_list    both sets together; sorted in place.
+//   subsequence   genome fragment starting at genome position `offset`.
+//   thread_index  selects the complex_match_records slot to append to.
+// Subsets are tried from largest to smallest and overlapping subsets are
+// skipped. On the first match the matched SNPs are removed from all three
+// lists, one record "CHROM\tREF\tALT\t<ref variants>\t<query variants>\n" is
+// appended (REF/ALT being the trimmed fragment and its edited form, variants
+// listed as "pos,ref,alt;" with 1-based pos) and true is returned.
+// Returns false when no pair matches, or when the matched fragment cannot be
+// trimmed any further at the start of the genome.
+bool VCF::MatchSnpLists(vector<SNP> & ref_snp_list,
+                        vector<SNP> & query_snp_list,
+                        vector<SNP> & mixed_list,
+                        const string subsequence,
+                        int offset,
+                        int thread_index)
+    // handle heterozygous snps
+	// Resulting sequence -> the reference subset that produces it.
+	map<string, vector<SNP> > ref_choice_snps;
+	sort(mixed_list.begin(), mixed_list.end());
+	for (int i = ref_snp_list.size(); i >= 1; i--) {
+		vector<vector<SNP> > combinations = CreateCombinations(ref_snp_list, i);
+        for (int k = 0; k < combinations.size(); k++) {
+			auto c = combinations[k];
+			if(CheckVariantOverlap(c)) continue;
+            string ref_sequence = ModifySequenceBySnpList(subsequence, c, offset);
+            ref_choice_snps[ref_sequence] = c;
+		}
+	}
+	for (int i = query_snp_list.size(); i >= 1; i--) {
+		vector<vector<SNP> > combinations = CreateCombinations(query_snp_list, i);
+		for (int k = 0; k < combinations.size(); k++) {
+			auto c = combinations[k];
+			if(CheckVariantOverlap(c)) continue;
+            string que_sequence = ModifySequenceBySnpList(subsequence, c, offset);
+			if (ref_choice_snps.find(que_sequence) != ref_choice_snps.end()) {
+				// delete all matched
+                auto r = ref_choice_snps[que_sequence];
+				sort(r.begin(), r.end());
+                string matching_result = "";
+                matching_result += chromosome_name;
+                string parsimonious_ref = subsequence;
+                string parsimonious_alt = que_sequence;
+                if(parsimonious_ref == parsimonious_alt){
+                    dout << "[Error] in variant, ref == alt";
+                }
+				// normalize
+				int left_index = offset;
+				if (toupper(genome_sequence[left_index]) != toupper(subsequence[0])) {
+					dout << "[Error] genome sequence, subsequence, offset does not match." << endl;
+				}
+				bool change_in_allels = true;
+				while (change_in_allels) {
+					change_in_allels = false;
+					if (toupper(parsimonious_ref.back()) == toupper(parsimonious_alt.back())) {
+						// Both alleles must keep a base after trimming, unless genome
+						// to the left can refill them. The size of alt is tested here;
+						// testing its last character instead was always true and let
+						// left_index drop below zero.
+						if ((parsimonious_ref.size() > 1 && parsimonious_alt.size() > 1) || left_index > 0) {
+							parsimonious_ref.pop_back();
+							parsimonious_alt.pop_back();
+							change_in_allels = true;
+						}
+						else {
+							return false;
+						}
+					}
+					if (parsimonious_ref.length() == 0 || parsimonious_alt.length() == 0) {
+						// Extend both alleles one base to the left so neither is empty.
+						left_index--;
+						char left_char = genome_sequence[left_index];
+						parsimonious_ref = left_char + parsimonious_ref;
+						parsimonious_alt = left_char + parsimonious_alt;
+					}
+				}
+				while (toupper(parsimonious_ref[0]) == toupper(parsimonious_alt[0]) && parsimonious_ref.size() > 1 && parsimonious_alt.size() > 1) {
+					parsimonious_ref.erase(0, 1);
+					parsimonious_alt.erase(0, 1);
+				}
+                matching_result += "\t" + parsimonious_ref + "\t" + parsimonious_alt;
+                string ref_matching_variants = "";
+				// Remove every matched reference SNP from mixed_list and ref_snp_list.
+				for (int m = 0; m < r.size(); m++) {
+					SNP r_snp = r[m];
+					for (auto n = mixed_list.begin(); n != mixed_list.end(); n++) {
+						SNP m_snp = *n;
+						if (m_snp.pos == r_snp.pos &&
+                            m_snp.ref == r_snp.ref &&
+                            m_snp.alt == r_snp.alt &&
+                            m_snp.flag == r_snp.flag)
+                        {
+							// erase + break: the invalidated iterator is never reused
+							mixed_list.erase(n);
+                            break;
+                        }
+					}
+                    for (auto n = ref_snp_list.begin(); n != ref_snp_list.end(); n++){
+                        SNP m_snp = *n;
+						if (m_snp.pos == r_snp.pos &&
+                            m_snp.ref == r_snp.ref &&
+                            m_snp.alt == r_snp.alt &&
+                            m_snp.flag == r_snp.flag)
+                        {
+                        	// 1-based
+                            ref_matching_variants += to_string(m_snp.pos+1) + "," + m_snp.ref + "," + m_snp.alt + ";";
+							ref_snp_list.erase(n);
+                            break;
+                        }
+                    }
+				}
+                matching_result += "\t" + ref_matching_variants;
+                string que_matching_variants = "";
+				sort(c.begin(), c.end());
+				// Same removal for every matched query SNP.
+				for (int m = 0; m < c.size(); m++) {
+					SNP q_snp = c[m];
+					for (auto n = mixed_list.begin(); n != mixed_list.end(); n++) {
+						SNP m_snp = *n;
+						if (m_snp.pos == q_snp.pos &&
+                            m_snp.ref == q_snp.ref &&
+                            m_snp.alt == q_snp.alt &&
+                            m_snp.flag == q_snp.flag)
+                        {
+							mixed_list.erase(n);
+                            break;
+                        }
+					}
+                    for (auto n = query_snp_list.begin(); n != query_snp_list.end(); n++){
+                        SNP m_snp = *n;
+						if (m_snp.pos == q_snp.pos &&
+                            m_snp.ref == q_snp.ref &&
+                            m_snp.alt == q_snp.alt &&
+                            m_snp.flag == q_snp.flag)
+                        {
+                        	// 1-based
+                            que_matching_variants += to_string(m_snp.pos+1) + "," + m_snp.ref + "," + m_snp.alt + ";";
+							query_snp_list.erase(n);
+                            break;
+                        }
+                    }
+				}
+                matching_result += "\t" + que_matching_variants + "\n";
+                complex_match_records[thread_index]->push_back(matching_result);
+                return true;
+			}
+		}
+	}
+	return false;
+// private
+// Worker routine: matches reference against query variants for every cluster id
+// in [start, end). thread_index is forwarded to MatchSnpLists.
+// The loop stops at the first cluster id that is not present in
+// cluster_snps_map (see the trailing `break`).
+void VCF::ClusteringSearchInThread(int start, int end, int thread_index) {
+    for (int cluster_id = start; cluster_id < end; cluster_id++) {
+		if (cluster_snps_map.find(cluster_id) != cluster_snps_map.end()) {
+            auto & snp_list = cluster_snps_map[cluster_id];
+            vector<SNP> candidate_ref_snps;
+			vector<SNP> candidate_que_snps;
+			vector<SNP> candidate_snps;
+            // [min_pos, max_pos) becomes the genome window covering every variant of the cluster
+            int min_pos = std::numeric_limits<int>::max();
+			int max_pos = 0;
+			for (int i = 0; i < snp_list.size(); i++) {
+				auto s = snp_list[i];
+				// flag == 1 marks a reference variant, flag == -1 a query variant
+				if (s.flag == 1) {
+					candidate_ref_snps.push_back(s);
+				}
+				else if(s.flag == -1) {
+					candidate_que_snps.push_back(s);
+				}
+                candidate_snps.push_back(s);
+				if (min_pos > s.pos) min_pos = s.pos;
+				if (max_pos < s.pos + (int)(s.ref.length())) max_pos = s.pos + (int)(s.ref.length());
+			}
+			// widen the window by one base on each side, clamped to the genome
+			min_pos = max(0, min_pos - 1);
+			max_pos = min(max_pos + 1, (int)genome_sequence.length());
+            string subsequence = genome_sequence.substr(min_pos, max_pos-min_pos);
+            // nothing to match when one side is empty or when both sides hold at most one variant
+            if (candidate_ref_snps.size() == 0 || candidate_que_snps.size() == 0) continue;
+			if (candidate_ref_snps.size() <= 1 && candidate_que_snps.size() <= 1) continue;
+            // more than 10 variants on either side: try to split the cluster into sub-clusters first
+            if(candidate_ref_snps.size() > 10 || candidate_que_snps.size() > 10){
+                vector<SNP> cluster_ref_snps;
+                vector<SNP> cluster_que_snps;
+                // accumulated insertion/deletion lengths of the current sub-cluster, per side
+                int ins_ref = 0;
+                int del_ref = 0;
+                int ins_que = 0;
+                int del_que = 0;
+                // NOTE(review): this c_start is shadowed by the declaration inside the loop below and is never read
+                int c_start = std::numeric_limits<int>::max();
+                // smallest start position seen so far in the current walk
+                int c_end = std::numeric_limits<int>::max();
+                // temporarily replace pos by the end coordinate (pos + ref length) before sorting
+                for(int i = 0; i < candidate_snps.size(); i++){
+                    candidate_snps[i].pos += (int)candidate_snps[i].ref.length();
+                }
+                // ordering comes from operator< on SNP, which is defined outside this view
+                sort(candidate_snps.begin(), candidate_snps.end());
+                // walk the sorted list from its last element to its first
+                for (int i = candidate_snps.size()-1; i >= 0; i--) {
+                    auto snp = candidate_snps[i];
+                    // check whether the current sub-cluster must be closed before adding this variant
+                    if (i < candidate_snps.size() - 1) {
+                        int c_start = snp.pos;
+                        if(c_start < c_end){
+                            // genome bases between this variant's end and the sub-cluster's smallest start
+                            string separator = genome_sequence.substr(c_start, c_end - c_start);
+                            int max_change = max(ins_ref + del_que, ins_que + del_ref);
+                            // split only if the gap is longer than twice the accumulated indel length
+                            // and CheckTandemRepeat returns false for it
+                            if ((int)separator.length() > 2 * max_change && !CheckTandemRepeat(separator, max_change))
+                            {
+                                // empty-bodied loop: keep calling MatchSnpLists until it returns false or a list is empty
+                                while(cluster_ref_snps.size() > 0 &&
+                                        cluster_que_snps.size() > 0 &&
+                                        MatchSnpLists(cluster_ref_snps, cluster_que_snps, snp_list, subsequence, min_pos, thread_index));
+                                cluster_ref_snps.clear();
+                                cluster_que_snps.clear();
+                                ins_ref = 0;
+                                del_ref = 0;
+                                ins_que = 0;
+                                del_que = 0;
+                            }
+                        }
+                    }
+                    // c_end is not reset when a sub-cluster is closed
+                    if(c_end > snp.pos- (int)snp.ref.length()) c_end = snp.pos - (int)snp.ref.length();
+                    // assign snp to cluster (restore the original start coordinate on the local copy first)
+                    snp.pos -= (int)snp.ref.length();
+                    // any flag other than 1 is treated as a query variant here
+                    if(snp.flag == 1){
+                        cluster_ref_snps.push_back(snp);
+                    }else{
+                        cluster_que_snps.push_back(snp);
+                    }
+                    int ref_length = (int)snp.ref.length();
+                    int alt_length = (int)snp.alt.length();
+                    int diff_length = abs(ref_length - alt_length);
+                    // update the indel length counters of the side this variant belongs to
+                    if (snp.flag == 1) {
+                        if (snp.snp_type == 'I') {
+                            ins_ref += diff_length;
+                        }
+                        else if (snp.snp_type == 'D') {
+                            del_ref += diff_length;
+                        }
+                    }
+                    else {
+                        if (snp.snp_type == 'I') {
+                            ins_que += diff_length;
+                        }
+                        else if (snp.snp_type == 'D') {
+                            del_que += diff_length;
+                        }
+                    }
+                }
+                // if separating the cluster did not help (last sub-cluster still > 20 on a side), it is discarded
+                // NOTE(review): both outcomes of the length heuristic below end in `continue`;
+                // skip_flag only decides whether the warning is printed
+                if(cluster_ref_snps.size() > 20 || cluster_que_snps.size() > 20){
+                    // final check by variant length, if not applicable, skip it and give a warning.
+                    if (cluster_ref_snps.size() > cluster_que_snps.size()){
+                        // total deleted / inserted length over the reference side
+                        int ref_sum_del_len = 0;
+                        int ref_sum_ins_len = 0;
+                        for(int j = 0; j < cluster_ref_snps.size(); j++){
+                            int len_change = cluster_ref_snps[j].ref.size() -  cluster_ref_snps[j].alt.size();
+                            if (len_change > 0){
+                                ref_sum_del_len += len_change;
+                            }else if(len_change < 0){
+                                ref_sum_ins_len -= len_change;
+                            }
+                        }
+                        // a query indel longer than the reference side's total cannot be reproduced
+                        bool skip_flag = false;
+                        for(int j = 0; j < cluster_que_snps.size(); j++){
+                            int len_change = cluster_que_snps[j].ref.size() - cluster_que_snps[j].alt.size();
+                            if(len_change > 0){
+                                if (ref_sum_del_len < len_change){
+                                    skip_flag = true;
+                                    break;
+                                }
+                            }else if(len_change < 0){
+                                if (ref_sum_ins_len < len_change * -1){
+                                    skip_flag = true;
+                                    break;
+                                }
+                            }
+                        }
+                        if (skip_flag) continue;
+                    }else{
+                        // mirror image of the branch above with the roles of reference and query swapped
+                        int que_sum_del_len = 0;
+                        int que_sum_ins_len = 0;
+                        for(int j = 0; j < cluster_que_snps.size(); j++){
+                            int len_change = cluster_que_snps[j].ref.size() -  cluster_que_snps[j].alt.size();
+                            if (len_change > 0){
+                                que_sum_del_len += len_change;
+                            }else if(len_change < 0){
+                                que_sum_ins_len -= len_change;
+                            }
+                        }
+                        bool skip_flag = false;
+                        for(int j = 0; j < cluster_ref_snps.size(); j++){
+                            int len_change = cluster_ref_snps[j].ref.size() - cluster_ref_snps[j].alt.size();
+                            if(len_change > 0){
+                                if (que_sum_del_len < len_change){
+                                    skip_flag = true;
+                                    break;
+                                }
+                            }else if(len_change < 0){
+                                if (que_sum_ins_len < len_change * -1){
+                                    skip_flag = true;
+                                    break;
+                                }
+                            }
+                        }
+                        if(skip_flag) continue;
+                    }
+                    cout << "[Warning] large cluster found, skip it." << endl;
+                    continue;
+                }
+                // match whatever remains in the last sub-cluster
+                while(cluster_ref_snps.size() > 0 &&
+                        cluster_que_snps.size() > 0 &&
+                        MatchSnpLists(cluster_ref_snps, cluster_que_snps, snp_list, subsequence, min_pos, thread_index));
+            }
+            else
+            {
+                // small cluster: match the full candidate lists directly
+                while(candidate_ref_snps.size() > 0 &&
+                        candidate_que_snps.size() > 0 &&
+                        MatchSnpLists(candidate_ref_snps, candidate_que_snps, snp_list, subsequence, min_pos, thread_index));
+            }
+		}
+		else {
+			// first missing cluster id ends this worker's range early
+			break;
+		}
+	}
+// match by cluster
+// private
+// Runs the clustering search over every cluster in cluster_snps_map.
+// The cluster id range is cut into thread_num contiguous slices; the first
+// thread_num - 1 slices run in spawned threads and the last one in the calling
+// thread. Each worker appends to its own complex_match_records[thread_index]
+// vector, and the collected records are written to output_complex_filename
+// once all threads have joined.
+void VCF::ClusteringSearchMultiThread() {
+    clustering_search = true;
+    // one result vector per thread so workers never write to a shared container
+    complex_match_records = new vector<string>* [thread_num];
+    for (int j = 0; j < thread_num; j++) {
+        complex_match_records[j] = new vector<string>;
+    }
+    // begin()->first must not be evaluated on an empty map
+    if (!cluster_snps_map.empty()) {
+        int start = cluster_snps_map.begin()->first;
+        int cluster_number = (int)cluster_snps_map.size();
+        // one past the last cluster id, assuming ids are consecutive from `start`
+        int cluster_end_boundary = start + cluster_number;
+        // slice width, rounded up so thread_num slices cover every cluster
+        int cluster_step = cluster_number / thread_num;
+        if (cluster_step * thread_num < cluster_number) cluster_step++;
+        int end = start + cluster_step;
+        vector<thread> threads;
+        //spawn threads
+        int i = 0;
+        for (; i < thread_num - 1; i++) {
+            threads.push_back(thread(&VCF::ClusteringSearchInThread, this, start, end, i));
+            start = end;
+            end = start + cluster_step;
+        }
+        // the calling thread takes the last slice; i equals (thread_num - 1) here
+        if (i != thread_num - 1) {
+            dout << "[Error] thread number not match" << endl;
+        }
+        // compare against the id boundary, not the element count: ids start at
+        // the map's first key, which need not be 0
+        if (start >= cluster_end_boundary) {
+            dout << "[Error] index out of map range" << endl;
+        }
+        else {
+            ClusteringSearchInThread(start, end, i);
+        }
+        // wait for every worker before reading the per-thread records
+        std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
+    }
+    ofstream output_complex_file;
+    output_complex_file.open(output_complex_filename);
+    output_complex_file << "##VCF1:" << ref_vcf_filename << endl;
+    output_complex_file << "##VCF2:" << que_vcf_filename << endl;
+    output_complex_file << "#CHROM\tPOS\tREF\tALT\tVCF1\tVCF2" << endl;
+    for (int i = 0; i < thread_num; i++) {
+        for (size_t j = 0; j < complex_match_records[i]->size(); j++) {
+            // skip records that are empty or consist of spaces only
+            if (complex_match_records[i]->at(j).find_first_not_of(' ') != std::string::npos) {
+                output_complex_file << complex_match_records[i]->at(j);
+            }
+        }
+    }
+    output_complex_file.close();
+    for (int j = 0; j < thread_num; j++) {
+        delete complex_match_records[j];
+    }
+    delete [] complex_match_records;
+// private
+// Counts the reference variants currently held and returns the total.
+// indel_num is overwritten with the number of those whose ref and alt lengths differ.
+// With clustering_search set, only entries of cluster_snps_map with flag == 1
+// are counted; otherwise every entry of refpos_2_snp is counted.
+int VCF::GetRefSnpNumber(int & indel_num) {
+    int result = 0;
+    indel_num = 0;
+    if (clustering_search) {
+        // iterate by reference: copying each vector<SNP> just to count it is wasted work
+        for (const auto & cluster : cluster_snps_map) {
+            for (const auto & snp : cluster.second) {
+                if (snp.flag != 1) continue;
+                result++;
+                if (snp.ref.length() != snp.alt.length())
+                    indel_num++;
+            }
+        }
+    } else {
+        for (const auto & pos_2_snps : refpos_2_snp) {
+            for (const auto & entry : pos_2_snps) {
+                result += (int)entry.second.size();
+                for (const auto & snp : entry.second) {
+                    if (snp.ref.length() != snp.alt.length())
+                        indel_num++;
+                }
+            }
+        }
+    }
+    return result;
+// private
+// Counts the query variants currently held and returns the total.
+// indel_num is overwritten with the number of those whose ref and alt lengths differ.
+// With clustering_search set, only entries of cluster_snps_map with flag == -1
+// are counted; otherwise every entry of querypos_2_snp is counted.
+int VCF::GetQuerySnpNumber(int & indel_num) {
+    int result = 0;
+    indel_num = 0;
+    if (clustering_search) {
+        // iterate by reference: copying each vector<SNP> just to count it is wasted work
+        for (const auto & cluster : cluster_snps_map) {
+            for (const auto & snp : cluster.second) {
+                if (snp.flag != -1) continue;
+                result++;
+                if (snp.ref.length() != snp.alt.length())
+                    indel_num++;
+            }
+        }
+    } else {
+        for (const auto & pos_2_snps : querypos_2_snp) {
+            for (const auto & entry : pos_2_snps) {
+                result += (int)entry.second.size();
+                for (const auto & snp : entry.second) {
+                    if (snp.ref.length() != snp.alt.length())
+                        indel_num++;
+                }
+            }
+        }
+    }
+    return result;
+// public
+// Compares a query VCF against a reference VCF over one genome sequence.
+//   ref_vcf / query_vcf : paths of the two VCF files
+//   genome_seq          : path of the genome sequence file
+//   direct_search       : when true, stop after the direct search stage
+//   output_prefix       : prefix for the ".stat", ".simple" and ".complex" outputs
+//   match_genotype      : stored in this->match_genotype
+//   normalization       : stored in this->normalization
+// Stages: read inputs, direct search, then (unless direct_search) clustering
+// search. Counts are logged through dout and written one per line to the
+// ".stat" file: the total counts first, then the same sequence for indels.
+void VCF::Compare(string ref_vcf,
+        string query_vcf,
+        string genome_seq,
+        bool direct_search,
+        string output_prefix,
+        bool match_genotype,
+        bool normalization){
+    ref_vcf_filename = ref_vcf;
+    que_vcf_filename = query_vcf;
+    this->match_genotype = match_genotype;
+    this->normalization = normalization;
+    output_stat_filename = output_prefix + ".stat";
+    output_simple_filename = output_prefix + ".simple";
+    output_complex_filename = output_prefix + ".complex";
+    //------------read genome sequence and decide boundary according to thread number
+    dsptime();
+    dout << " Read genome sequence file... " << endl;
+    ReadGenomeSequence(genome_seq);
+    dsptime();
+    dout << " Finish reading genome sequence file." << endl;
+    //------------read ref and query vcf file
+    dsptime();
+    dout << " Read reference vcf file... " << endl;
+    ReadRefVCF(ref_vcf);
+    dsptime();
+    dout << " Read query vcf file... " << endl;
+    ReadQueryVCF(query_vcf);
+    dsptime();
+    dout << " Finish reading all vcf file." << endl;
+    //------------check vcf entry number before matching
+    int ref_total_indel_num, que_total_indel_num;
+    int ref_total_num = GetRefSnpNumber(ref_total_indel_num);
+    int que_total_num = GetQuerySnpNumber(que_total_indel_num);
+    dout << " referece vcf entry number [total, indel]: " << ref_total_num << "," << ref_total_indel_num << endl;
+    dout << " query vcf entry number: [total, indel] " << que_total_num << "," << que_total_indel_num << endl;
+    //------------direct search
+    dsptime();
+    dout << " Direct search ... " << endl;
+    DirectSearchMultiThread();
+    dsptime();
+    dout << " Finish direct search." << endl;
+    // matched = total - what is still left after the stage
+    int ref_direct_left_indel_num, que_direct_left_indel_num;
+    int ref_direct_left_num = GetRefSnpNumber(ref_direct_left_indel_num);
+    int que_direct_left_num = GetQuerySnpNumber(que_direct_left_indel_num);
+    int ref_direct_match_num = ref_total_num - ref_direct_left_num;
+    int que_direct_match_num = que_total_num - que_direct_left_num;
+    int ref_direct_match_indel_num = ref_total_indel_num - ref_direct_left_indel_num;
+    int que_direct_match_indel_num = que_total_indel_num - que_direct_left_indel_num;
+    dout << " referece vcf entry direct match number [total, indel]: " << ref_direct_match_num << "," << ref_direct_match_indel_num << endl;
+    dout << " query vcf entry direct match number [total, indel]: " << que_direct_match_num  << "," << que_direct_match_indel_num << endl;
+    if (direct_search){
+        dout << " referece vcf entry mismatch number [total, indel]: " << ref_direct_left_num << "," << ref_direct_left_indel_num << endl;
+        dout << " query vcf entry mismatch number [total, indel]: " << que_direct_left_num << "," << que_direct_left_indel_num << endl;
+        ofstream output_stat_file;
+        output_stat_file.open(output_stat_filename);
+        output_stat_file << ref_total_num << endl;
+        output_stat_file << que_total_num << endl;
+        output_stat_file << ref_direct_match_num << endl;
+        output_stat_file << que_direct_match_num << endl;
+        output_stat_file << ref_direct_left_num << endl;
+        output_stat_file << que_direct_left_num << endl;
+        //=====================================================
+        output_stat_file << ref_total_indel_num << endl;
+        output_stat_file << que_total_indel_num << endl;
+        output_stat_file << ref_direct_match_indel_num << endl;
+        output_stat_file << que_direct_match_indel_num << endl;
+        output_stat_file << ref_direct_left_indel_num << endl;
+        output_stat_file << que_direct_left_indel_num << endl;
+        output_stat_file.close();
+        return;
+    }
+    //-------------clustering search
+    dsptime();
+    dout << " Clustering snps ... " << endl;
+    ClusteringSnps();
+    dsptime();
+    dout << " Finish clustering." << endl;
+    dsptime();
+    dout << " Clustering search ... " << endl;
+    ClusteringSearchMultiThread();
+    dsptime();
+    dout << " Finish clustering search." << endl;
+    int ref_cluster_left_indel_num, que_cluster_left_indel_num;
+    int ref_cluster_left_num = GetRefSnpNumber(ref_cluster_left_indel_num);
+    int que_cluster_left_num = GetQuerySnpNumber(que_cluster_left_indel_num);
+    int ref_cluster_match_num = ref_direct_left_num - ref_cluster_left_num;
+    int que_cluster_match_num = que_direct_left_num - que_cluster_left_num;
+    int ref_cluster_match_indel_num = ref_direct_left_indel_num - ref_cluster_left_indel_num;
+    int que_cluster_match_indel_num = que_direct_left_indel_num - que_cluster_left_indel_num;
+    dout << " referece vcf entry cluster match number [total, indel]: " << ref_cluster_match_num << "," << ref_cluster_match_indel_num << endl;
+    dout << " query vcf entry cluster match number [total, indel]: " << que_cluster_match_num << "," << que_cluster_match_indel_num << endl;
+    dout << " referece vcf entry mismatch number [total, indel]: " << ref_cluster_left_num << "," << ref_cluster_left_indel_num << endl;
+    dout << " query vcf entry mismatch number [total, indel]: " << que_cluster_left_num  << "," << que_cluster_left_indel_num << endl;
+    //write stat file
+    ofstream output_stat_file;
+    output_stat_file.open(output_stat_filename);
+    output_stat_file << ref_total_num << endl;
+    output_stat_file << que_total_num << endl;
+    output_stat_file << ref_direct_match_num << endl;
+    output_stat_file << que_direct_match_num << endl;
+    output_stat_file << ref_cluster_match_num << endl;
+    output_stat_file << que_cluster_match_num << endl;
+    output_stat_file << ref_cluster_left_num << endl;
+    output_stat_file << que_cluster_left_num << endl;
+    //=====================================================
+    output_stat_file << ref_total_indel_num << endl;
+    output_stat_file << que_total_indel_num << endl;
+    output_stat_file << ref_direct_match_indel_num << endl;
+    output_stat_file << que_direct_match_indel_num << endl;
+    output_stat_file << ref_cluster_match_indel_num << endl;
+    output_stat_file << que_cluster_match_indel_num << endl;
+    // indel leftovers after the clustering stage, mirroring the *_cluster_left_num
+    // lines of the totals section (previously the direct-search leftovers were written)
+    output_stat_file << ref_cluster_left_indel_num << endl;
+    output_stat_file << que_cluster_left_indel_num << endl;
+    output_stat_file.close();
+    return;
diff --git a/src/vcf.h b/src/vcf.h
new file mode 100644
index 0000000..77d2754
--- /dev/null
+++ b/src/vcf.h
@@ -0,0 +1,210 @@
+#pragma once // the same purpose as #include guards
+#include <unordered_map>
+#include <algorithm>
+#include <vector>
+#include <map>
+#include <list>
+#include <tuple>
+#include <chrono>
+#include <cstdint>
+#include <limits>
+#include <thread>
+#include <set>
+#include "util.h"
+#include <iostream>
+using namespace std;
+// One variant record used by the VCF comparison code.
+typedef struct SNP {
+    // Every argument has a default, so SNP() is default-constructible.
+    SNP(int pos_ = 0,
+        char snp_type_ = 'S',
+        string ref_ = "",
+        string alt_ = "",
+        string genotype_ = "1/1",
+        int flag_=1) :
+		pos(pos_),
+        snp_type(snp_type_),
+        ref(ref_),
+        alt(alt_),
+        genotype(genotype_),
+        flag(flag_){}
+	// start position of the variant; callers index genome_sequence with it
+	int pos;
+	// type code; defaults to 'S', and the clustering search tests for 'I' and 'D'
+	char snp_type;
+	// reference allele string
+	string ref;
+	// alternative allele string
+	string alt;
+    // genotype string, "1/1" unless given
+    string genotype;
+    // origin marker: 1 is counted as a reference variant, -1 as a query variant
+    int flag;
+// define outside of struct, idiomatic solution for lexicographical compare for structures
+bool operator <(const SNP& x, const SNP& y);
+bool operator ==(const SNP& x, const SNP& y);
+typedef vector<unordered_map<int, vector<SNP> > > SnpHash;
+typedef vector<map<int, vector<SNP> > > SnpMap;
+class VCF
+    int debug_f;
+    bool complex_search;
+    void ReadVCF(string filename, SnpHash & pos_2_snps);
+	void DirectSearchInThread(unordered_map<int, vector<SNP> > & ref_snps,
+							unordered_map<int, vector<SNP> > & query_snps,
+                            int thread_index);
+	// template functions can only be defined in the header file
+	// Returns every k-element combination of `dict` (elements kept in their
+	// original order) whose matching entries of `changes` add up to `target`.
+	// changes[i] is the value associated with dict[i]. An empty result is
+	// returned when k is outside [0, dict.size()] or `changes` is shorter
+	// than `dict`.
+	template <typename T>
+	vector<vector<T>> CreateCombinationsWithTarget(vector<T> dict, int k, vector<int> changes, int target) {
+		vector<vector<T>> result;
+		int n = (int)dict.size();
+		// guard: the mask fill and the changes[i] reads below would otherwise go out of range
+		if (k < 0 || k > n || (int)changes.size() < n) return result;
+		// selection mask: start with the first k elements chosen, then step
+		// through every arrangement with prev_permutation
+		vector<bool> v(n);
+		fill(v.begin(), v.begin() + k, true);
+		do {
+			vector<T> t;
+            int sum = 0;
+			for (int i = 0; i < n; ++i) {
+				if (v[i]){
+                    t.push_back(dict[i]);
+                    sum += changes[i];
+                }
+			}
+            if(sum == target){
+			    result.push_back(t);
+            }
+		} while (prev_permutation(v.begin(), v.end()));
+		return result;
+	}
+    void ClusteringSearchInThread(int start, int end, int thread_index);
+    //-------------------------following can be public--------------------------
+    // but for a better OO design, made them private
+    string ref_mismatch_filename;
+    string que_mismatch_filename;
+	// data structure for direct search
+	SnpHash refpos_2_snp;
+	SnpHash querypos_2_snp;
+	// data structure for complex search
+	SnpMap refpos_snp_map;
+	SnpMap querypos_snp_map;
+	void ReadRefVCF(string filename);
+	void ReadQueryVCF(string filename);
+	void DirectSearchMultiThread();
+	virtual void ClusteringSnps();
+	// default value better be in declaration, or definition, but never both
+	void ClusteringSearchMultiThread();
+	int GetRefSnpNumber(int & indel_num);
+	int GetQuerySnpNumber(int & indel_num);
+	//---------------------------above can be public:---------------------------
+    vector<int> pos_boundries; // boundries for split multi hash table
+    bool boundries_decided; // before deciding boundries, can not read vcf file, because do not know how to split
+	bool normalization;
+    // for inherit
+    bool match_genotype;
+	bool clustering_search;
+	int thread_num;
+    string chromosome_name;
+	string genome_sequence; // genome sequence from fasta file
+    const static int MAX_REPEAT_LEN = 1000;
+    const static int GENOTYPE_COLUMN_NUM = 10;
+    // data structure for clustering search
+    vector<SNP> data_list;
+    vector<int> cluster_list;
+    map<int, vector<SNP> > cluster_snps_map;
+	vector<unordered_map<int, int> > haplotype_matching_check; //vector is for multi-thread,
+    // storing complex match results
+    //std::mutex complex_match_mutex;
+    //vector<vector<string>> complex_match_records;
+    //lock free dynamic array of vector pointers for storing.
+    vector<string> ** direct_match_records;
+    vector<string> ** complex_match_records;
+    // for output
+    string ref_vcf_filename;
+    string que_vcf_filename;
+    string output_stat_filename;
+    string output_simple_filename;
+    string output_complex_filename;
+	bool CompareSnps(SNP r, SNP q);
+    virtual void DecideBoundaries();
+	string ModifySequenceBySnp(const string sequence, SNP s, int offset);
+	string ModifySequenceBySnpList(const string sequence, vector<SNP> s, int offset);
+	bool CheckTandemRepeat(string sequence, int unit_threshold);
+	void ReadGenomeSequence(string filename);
+    bool MatchSnpLists(vector<SNP> & ref_snp_list,
+            vector<SNP> & query_snp_list,
+            vector<SNP> & mixed_list,
+            const string subsequence,
+            int offset,
+            int thread_index);
+    // Returns every k-element combination of `dict`, elements kept in their
+    // original order. An empty result is returned when k is outside
+    // [0, dict.size()].
+    template <typename D>
+	vector<vector<D>> CreateCombinations(vector<D> dict, int k) {
+		vector<vector<D>> result;
+		int n = (int)dict.size();
+		// guard: the mask fill below would otherwise run outside the vector
+		if (k < 0 || k > n) return result;
+		// selection mask: first k elements chosen, then every arrangement via prev_permutation
+		vector<bool> v(n);
+		fill(v.begin(), v.begin() + k, true);
+		do {
+			vector<D> t;
+			for (int i = 0; i < n; ++i) {
+				if (v[i]) {
+					t.push_back(dict[i]);
+				}
+			}
+			result.push_back(t);
+		} while (prev_permutation(v.begin(), v.end()));
+		return result;
+	}
+	bool MatchSnpListsWithWeight(vector<SNP> & ref_snp_list,
+		vector<SNP> & query_snp_list,
+		vector<SNP> & mixed_list,
+		const string subsequence,
+		int offset,
+		int thread_index);
+    bool CheckVariantOverlap(vector<SNP> snp_list);
+	bool NormalizeSnp(int pos, string ref, string alt, string & parsimonious_ref, string & parsimonious_alt);
+	// Upper-cases s in place.
+	inline void ToUpper(string & s){
+        // toupper is only defined for values representable as unsigned char (or EOF),
+        // so each char is converted first instead of being passed as a possibly negative char
+        transform(s.begin(), s.end(), s.begin(),
+                  [](unsigned char c) { return (char)::toupper(c); });
+    }
+	VCF(int thread_num_ = 0);
+	~VCF();
+    static bool static_match_genotype; //global variable
+    // for public access
+	void Compare(string ref_vcf,
+            string query_vcf,
+            string genome_seq,
+            bool direct_search,
+            string output_prefix,
+            bool match_genotype,
+			bool normalization);
diff --git a/src/vm.cpp b/src/vm.cpp
new file mode 100644
index 0000000..6176e0a
--- /dev/null
+++ b/src/vm.cpp
@@ -0,0 +1,233 @@
+// concurrent.cpp : Defines the entry point for the console application.
+//#include "stdafx.h"
+#include <iostream>
+#include <thread>
+#include <tclap/CmdLine.h>
+#include "wholegenome.h"
+using namespace std;
+// Command-line options as filled in by TclapParser.
+typedef struct Args {
+	// baseline VCF path (-b / --baseline)
+	string ref_vcf_filename;
+	// not assigned by TclapParser (the assignment there is commented out)
+	string que_vcf_filename;
+	// genome FASTA path (-g / --genome_sequence)
+	string genome_seq_filename;
+	// output directory (-o / --output_dir), "." when not given
+	string output_dir;
+    // NOTE(review): the -p argument is never added to the command line, so this always holds its default "out"
+    string output_prefix;
+	// worker thread count (-t), replaced by the core count when <= 0 or above it
+	int thread_num;
+	// -u / --score_unit: one of -1, 0, 1
+	int score_unit;
+	// -m / --match_mode: one of -1, 0, 1
+	int match_mode;
+	// -s / --score_scheme: one of -1, 0, 1, 2, 3
+	int score_scheme;
+    // -e / --detail_results switch
+    bool detail_results;
+    // every query VCF path given with -q / --query
+    vector<string> query_file_list;
+    // true unless -C / --disable_curves is given
+    bool pr_curves;
+    // not assigned by TclapParser (the assignment there is commented out)
+    bool direct_match;
+//	bool direct_search;
+//	string chr_name;
+//	string stat_filename;
+//	bool remove_duplicates;
+//	string single_vcf_filename;
+//	bool match_genotype;
+//	bool normalization;
+//	bool score_basepair;
+//	bool overlap_match;
+//	bool variant_check; // check if variant matches
+//	bool whole_genome;
+// Parses the command line with TCLAP and stores the values in args.
+// Returns true after a successful parse; a TCLAP::ArgException is reported on
+// stderr and the process is aborted.
+// args.thread_num is replaced by the detected core count (at least 1) when the
+// requested value is <= 0 or larger than that count.
+bool TclapParser(Args & args, int argc, char** argv){
+	string version = "0.9";
+	try {
+		std::string desc = "Please cite our paper if you are using this program in your research. \n";
+		TCLAP::CmdLine cmd(desc, ' ', version);
+		//TCLAP::ValueArg<std::string> arg_input_vcf_file("i", "i", "input VCF file", true, "", "file", cmd);
+		TCLAP::ValueArg<std::string> arg_genome_seq_filename("g", "genome_sequence", "genome sequence FASTA file", true, "", "file");
+		TCLAP::ValueArg<std::string> arg_baseline_vcf_filename("b", "baseline", "baseline variant VCF file", true, "", "file");
+		TCLAP::MultiArg<std::string> arg_query_vcf_filename("q", "query", "query variant VCF file list", true, "file list");
+		TCLAP::ValueArg<std::string> arg_output_dir("o", "output_dir", "output directory, default is current working directory", false, ".", "string");
+		// NOTE: not registered with cmd below, so -p is not accepted and the default "out" is always used
+		TCLAP::ValueArg<std::string> arg_output_prefix("p", "file_prefix", "output filename prefix, default is \"out\"", false, "out", "string");
+		// hardware_concurrency() may report 0 when the count cannot be determined
+		int max_cores = (int)thread::hardware_concurrency();
+		if(max_cores <= 0) max_cores = 1;
+		string thread_string = "number of threads, default is the number of available cores (For this machine: " + to_string(max_cores) + ").\n"
+                            "If larger than number of available cores or less than 1, automatically set to default value";
+		// use the clamped count as the default so the advertised default is never 0
+		TCLAP::ValueArg<int> arg_thread_num("t", "thread_num", thread_string, false, max_cores, "int");
+		vector<int> allowed_two = {-1, 0,1};
+		TCLAP::ValuesConstraint<int> allowedVals(allowed_two);
+        string score_unit_string = "scoring function/score unit: (Default: -1)\n"
+        "-1 : iterate both 0 and 1.\n"
+        "0 : the score that a VCF entry contributes is 1.\n"
+        "1 : the score that a VCF entry contributes is the edit distance between the new allele and the reference one.\n";
+        TCLAP::ValueArg<int> arg_score_unit("u", "score_unit", score_unit_string, false, -1, &allowedVals);
+        string match_mode_string = "matching mode: (Default: -1)\n"
+        "-1 : iterate both 0 and 1.\n"
+        "0 : a set of query entries match a set of baseline entries if, "
+        "for each entry, we can select one of the alleles such that the inferred sequences are identical\n"
+        "1 : a set of query entries match a set of baseline entries if there exist a phasing of each set such that "
+        "the two inferred haplotypes from the query are equal to the two inferred haplotypes from the baseline.\n";
+        TCLAP::ValueArg<int> arg_match_mode("m", "match_mode", match_mode_string, false, -1, &allowedVals);
+        string score_scheme_string = "scoring scheme: (Default: -1)\n"
+        "-1 : iterate 0, 1, and 2 (not including 3)\n"
+        "0 : find two subsets of non-overlapping equivalent variants such that "
+        "the score of the matched variants is maximized (Default)\n"
+        "1 : find two subsets of non-overlapping equivalent variants such that"
+        " the score of the chosen baseline variants is maximized\n"
+        "2 : find a maximum scoring set of variants in the query such that"
+        " each variant can be matched by a subset of the baseline variants\n"
+        "3 : (1 to 1 direct match) find a maximum scoring set of entry pairs such that each entry pair contains"
+        " one query and one baseline variant that result in the same sequence. In this scheme, different scoring functions and "
+        "matching mode have no difference.\n";
+        vector<int> allowed_four = {-1,0,1,2,3};
+        TCLAP::ValuesConstraint<int> allowedFour(allowed_four);
+        TCLAP::ValueArg<int> arg_score_scheme("s", "score_scheme", score_scheme_string, false, -1, &allowedFour);
+        //string direct_match_string = "Direct Match. \n";
+        //TCLAP::SwitchArg arg_direct_match("d", "direct_match", direct_match_string, cmd, false);
+        string detail_results_string = "output detail matching results, by default do not output.\n"
+        "filename in format PREFIX.PARAMETER.match\n"
+        "The results present which variants in baseline match which variants in query.";
+        // switches register themselves through the cmd constructor argument
+        TCLAP::SwitchArg arg_detail_results("e","detail_results", detail_results_string, cmd, false);
+        string precision_recall_string = "Disable Precision-Recall curves. \n";
+        TCLAP::SwitchArg arg_disable_curves("C", "disable_curves", precision_recall_string, cmd, false);
+        cmd.add(arg_score_scheme);
+        cmd.add(arg_match_mode);
+        cmd.add(arg_score_unit);
+        cmd.add(arg_thread_num);
+        //cmd.add(arg_output_prefix);
+        cmd.add(arg_output_dir);
+        cmd.add(arg_query_vcf_filename);
+        cmd.add(arg_baseline_vcf_filename);
+        cmd.add(arg_genome_seq_filename);
+		cmd.parse(argc, argv);
+		args.genome_seq_filename = arg_genome_seq_filename.getValue();
+		args.ref_vcf_filename = arg_baseline_vcf_filename.getValue();
+		args.query_file_list = arg_query_vcf_filename.getValue();
+        //args.que_vcf_filename = arg_query_vcf_filename.getValue();
+        args.output_dir = arg_output_dir.getValue();
+		args.output_prefix = arg_output_prefix.getValue();
+		args.thread_num = arg_thread_num.getValue();
+		if(args.thread_num <= 0 || args.thread_num > max_cores) args.thread_num = max_cores;
+		args.score_unit = arg_score_unit.getValue();
+		args.match_mode = arg_match_mode.getValue();
+		args.score_scheme = arg_score_scheme.getValue();
+        args.detail_results = arg_detail_results.getValue();
+        // the switch disables the curves, hence the negation
+        args.pr_curves = ! arg_disable_curves.getValue();
+        //args.direct_match = arg_direct_match.getValue();
+	}
+	catch (TCLAP::ArgException &e)
+	{
+		std::cerr << "error: " << e.error() << " for arg " << e.argId() << "\n";
+		abort();
+	}
+	return true;
+// Prints a short usage summary to stdout and returns 0.
+// The option list mirrors the arguments registered in TclapParser.
+int usage(char* command) {
+	cout << "\n";
+	cout << "\tPlease cite our paper if you are using this program in your research." << endl;
+    cout << endl;
+	cout << "Usage: " << endl;
+    cout << command << " -g genome file path(FASTA format)" << endl;
+    cout << "\t-b baseline VCF file path" << endl;
+    cout << "\t-q query VCF file path (may be given more than once)" << endl;
+    cout << "\t[-o output directory]" << endl;
+    cout << "\t[-t thread number]" << endl;
+    cout << "\t[-u score unit: -1, 0 or 1]" << endl;
+    cout << "\t[-m match mode: -1, 0 or 1]" << endl;
+    cout << "\t[-s score scheme: -1, 0, 1, 2 or 3]" << endl;
+    cout << "\t[-e output detail matching results]" << endl;
+    cout << "\t[-C disable Precision-Recall curves]" << endl;
+    cout << endl;
+	return 0;
+// Entry point: parses the command line, loads the genome sequence and the
+// baseline VCF once, then compares each query VCF against them in turn.
+int main(int argc, char* argv[])
+    Args args;
+    // aborts the process itself on a malformed command line
+    TclapParser(args, argc, argv);
+    WholeGenome wg(args.thread_num,
+                   args.output_dir,
+                   args.pr_curves);
+    wg.ReadRef(args.genome_seq_filename,
+        args.ref_vcf_filename);
+    // the i-th query file (1-based) is passed to Compare together with the name "query<i>"
+    for(size_t i = 0; i < args.query_file_list.size(); i++){
+        string query_filename = args.query_file_list[i];
+        wg.Compare(query_filename,
+            "query"+to_string(i+1),
+            args.detail_results,
+            args.score_unit,
+            args.match_mode,
+            args.score_scheme);
+    }
+    return 0;
diff --git a/src/wholegenome.cpp b/src/wholegenome.cpp
new file mode 100644
index 0000000..bb05500
--- /dev/null
+++ b/src/wholegenome.cpp
@@ -0,0 +1,3341 @@
+#include "wholegenome.h"
+using namespace std;
+// constructor
+// Stores the thread count and output directory, allocates one variant list per
+// chromosome slot for both the reference and the query set, registers the human
+// chromosome names (with and without the "chr" prefix) and fills per_list.
+WholeGenome::WholeGenome(int thread_num_,
+    string output_dir_,
+    bool pr_curves){
+    thread_num = thread_num_;
+    output_dir = output_dir_;
+    chrom_num = 24;
+    // one heap-allocated variant list per chromosome slot, for each of the two sets
+    ref_variant_by_chrid = new vector<DiploidVariant>*[chrom_num];
+    que_variant_by_chrid = new vector<DiploidVariant>*[chrom_num];
+    for (int slot = 0; slot < chrom_num; ++slot) {
+        ref_variant_by_chrid[slot] = new vector<DiploidVariant>;
+        que_variant_by_chrid[slot] = new vector<DiploidVariant>;
+    }
+    // chr_id starts from 0: autosomes 1..22 map to 0..21
+    for (int number = 1; number <= 22; ++number) {
+        const string plain_name = to_string(number);
+        const int id = number - 1;
+        chrname_dict[plain_name] = id;
+        chrname_dict["chr" + plain_name] = id;
+    }
+    chrname_dict["X"] = 22;
+    chrname_dict["chrX"] = 22;
+    chrname_dict["Y"] = 23;
+    chrname_dict["chrY"] = 23;
+    // without pr_curves only the single entry 0.0 is used
+    if (!pr_curves) {
+        per_list = {0.0};
+    } else {
+        per_list = {0.0, 0.1, 0.2, 0.3, 0.9};
+    }
+}
+// Packs the three matching options into one small index:
+//   bit 3     = lowest bit of score_unit
+//   bit 2     = lowest bit of match_mode
+//   bits 1..0 = lowest two bits of score_scheme
+inline int WholeGenome::GetIndexFromMatchScore(int score_unit, int match_mode, int score_scheme){
+    const int unit_bit = (score_unit & 1) << 3;
+    const int mode_bit = (match_mode & 1) << 2;
+    const int scheme_bits = score_scheme & 3;
+    return unit_bit | mode_bit | scheme_bits;
+}
+// destructor
+// NOTE(review): the signature line is not present here; the statements below are the body.
+// Frees every per-chromosome variant list, then the two pointer arrays.
+// NOTE(review): the loop runs to chrom_num, which ReadWholeGenomeSequence overwrites with the
+// number of sequences it read, while the constructor allocated 24 slots - confirm both agree.
+    for(int j = 0; j < chrom_num; j++){
+        ref_variant_by_chrid[j]->clear();
+        delete ref_variant_by_chrid[j];
+        que_variant_by_chrid[j]->clear();
+        delete que_variant_by_chrid[j];
+    }
+    delete[] ref_variant_by_chrid;
+    delete[] que_variant_by_chrid;
+// Reads a FASTA file and stores one sequence per chromosome.
+//   filename: path of the FASTA file
+// For every record the name is the text between '>' and the first space. It must be a key of
+// chrname_dict. Ids are handed out in order of first appearance and recorded in
+// chrid_by_chrname / chrname_by_chrid; the sequence goes to genome_sequences[id].
+// A record containing a line with a space is discarded. chrom_num is set to the number of
+// records stored.
+// Returns false when the file cannot be opened or an unknown chromosome name is found.
+bool WholeGenome::ReadWholeGenomeSequence(string filename){
+    std::ifstream input(filename);
+    if(!input.good()){
+        std::cerr << "Error opening '"<<filename<<"'. Bailing out." << std::endl;
+        return false;
+    }
+    std::string line, name, content;
+    int real_chrom_num = 0;
+    int chr_id = 0;
+    // Stores the record currently held in name/content; false if the name is unknown.
+    auto store_record = [&]() -> bool {
+        if(chrname_dict.find(name) == chrname_dict.end()){
+            cout << "[VarMatch] Error: detected chromosome name: " << name <<" does not exist in human genome." << endl;
+            return false;
+        }
+        if(chrid_by_chrname.find(name) == chrid_by_chrname.end()){
+            chrid_by_chrname[name] = chr_id;
+            chr_id++;
+        }
+        int current_id = chrid_by_chrname[name];
+        chrname_by_chrid[current_id] = name;
+        genome_sequences[current_id] = content;
+        real_chrom_num++;
+        return true;
+    };
+    // Test the stream itself rather than good(): when the last line has no trailing newline,
+    // getline() sets eofbit while still delivering the line, and good() would drop it.
+    while( std::getline( input, line ) ){
+        if( line.empty() || line[0] == '>' ){ // Identifier marker (or blank line): close the previous record
+            if( !name.empty() ){
+                if( !store_record() ) return false;
+                name.clear();
+            }
+            if( !line.empty() ){
+                name = split(line, ' ')[0].substr(1);
+            }
+            content.clear();
+        } else if( !name.empty() ){
+            if( line.find(' ') != std::string::npos ){ // Invalid sequence--no spaces allowed
+                name.clear();
+                content.clear();
+            } else {
+                content += line;
+            }
+        }
+    }
+    // the last record is not followed by another header, so store it here
+    if( !name.empty() && !store_record() ) return false;
+    chrom_num = real_chrom_num;
+    return true;
+}
+bool WholeGenome::ReadGenomeSequenceList(string filename){
+// Reads one VCF file into the per-chromosome variant lists.
+//   filename: path of the VCF file
+//   flag:     false -> variants are appended to ref_variant_by_chrid,
+//             true  -> variants are appended to que_variant_by_chrid, the raw line is kept in
+//                      query_variant_strings and the QUAL values are used to extend
+//                      threshold_list and to replace per_list
+// Genotype matching is switched off (match_mode_indicator = 1) as soon as a line lacks the
+// sample column, the FORMAT column has no GT field, or a genotype does not have two alleles.
+// Variants whose insertion or deletion length exceeds VAR_LEN and variants on a chromosome
+// without a loaded sequence are skipped.
+// Returns the number of variants stored, or -1 when the file cannot be opened.
+int WholeGenome::ReadWholeGenomeVariant(string filename, bool flag){
+    int total_num = 0;
+    int long_num = 0;
+    double QUAL_LOWER_BOUND = 0.1;
+    ifstream vcf_file;
+    vcf_file.open(filename.c_str());
+    if (!vcf_file.good()) {
+        cout << "[VarMatch] Error: can not open vcf file" << endl;
+        return -1;
+    }
+    vector<float> quality_list;
+    int genotype_index = -1;
+    char genotype_separator = '/';
+    string line;
+    // getline() as the loop condition also ends the loop on a stream error, not only at EOF
+    while (getline(vcf_file, line, '\n')) {
+        // skip blank lines and header / meta-information lines
+        if ((int)line.length() <= 1) continue;
+        if (line[0] == '#') continue;
+        auto columns = split(line, '\t');
+        if (columns.size() < 10) {
+            if(match_mode_indicator != 1){
+                cout << "[VarMatch] Warning: not enough information in VCF file for genotype matching." << endl;
+                cout << "[VarMatch] \tAutomatically turn off genotype matching module " << filename << endl;
+                match_mode_indicator = 1;
+            }
+            if(columns.size() < 6){
+                cout << "[VarMatch] Warning: not enough information in VCF file for variant matching." << endl;
+                cout << "[VarMatch] skip current variant: " << line << endl;
+                continue;
+            }
+        }
+        string chr_name = columns[0];
+        auto pos = atoi(columns[1].c_str()) - 1; // 0-based coordinate
+        auto ref = columns[3];
+        auto alt_line = columns[4];
+        // NOTE(review): stod throws when QUAL is not numeric (for example "."); confirm inputs
+        double quality = stod(columns[5]);
+        if(flag){
+            quality_list.push_back(quality);
+        }
+        ToUpper(ref);
+        ToUpper(alt_line);
+        bool is_heterozygous_variant = false;
+        bool is_multi_alternatives = false;
+        bool is_zero_one_var = false;
+        vector<string> genotype_columns;
+        if (match_mode_indicator != 1) { // match mode indicator is -1 or 0
+            if (genotype_index < 0) {
+                // locate the GT field of the FORMAT column; the index is reused for later lines
+                auto formats = split(columns[8], ':');
+                for (int i = 0; i < (int)formats.size(); i++) {
+                    if (formats[i] == "GT") {
+                        genotype_index = i;
+                        break;
+                    }
+                }
+                // if GT not found
+                if(genotype_index < 0){
+                    cout << "[VarMatch] Warning: VCF entry does not contain genotype information." << endl;
+                    cout << "[VarMatch] \tAutomatically turn off genotype matching mode. " << endl;
+                    match_mode_indicator = 1;
+                }
+            }
+            if(match_mode_indicator != 1){
+                auto additionals = split(columns[9], ':');
+                genotype_columns = split(additionals[genotype_index], genotype_separator);
+                if(genotype_columns.size() != 2){
+                    // try the other separator ('/' <-> '|') and keep it for the following lines
+                    if(genotype_separator == '/'){
+                        genotype_separator = '|';
+                    }else{
+                        genotype_separator = '/';
+                    }
+                    genotype_columns = split(additionals[genotype_index], genotype_separator);
+                }
+                if (genotype_columns.size() != 2) {
+                    cout << "[VarMatch] Warning: Unrecognized Genotype: " << additionals[genotype_index] << endl;
+                    cout << "[VarMatch] \tAutomatically turn off genotype matching mode." << endl;
+                    match_mode_indicator = 1;
+                }
+                else {
+                    if (genotype_columns[0] != genotype_columns[1]) {
+                        is_heterozygous_variant = true;
+                    }
+                    if (genotype_columns[1] == "0" && genotype_columns[0] == "0") {
+                        // both alleles are the reference allele: nothing to match
+                        continue;
+                    }
+                    if(genotype_columns[0] == "0" || genotype_columns[1] == "0"){
+                        is_zero_one_var = true;
+                    }
+                }
+            }
+        }
+        vector<string> alt_list;
+        if (alt_line.find(",") != std::string::npos) {
+            alt_list = split(alt_line, ',');
+            is_multi_alternatives = true;
+        }
+        else {
+            alt_list.push_back(alt_line);
+        }
+        if(alt_list.size() > 2){
+            vector<string> temp_alt_list = alt_list;
+            alt_list.clear();
+            if(match_mode_indicator != 1){
+                // keep only the alleles named by the genotype
+                for(int i = 0; i < 2; i++){
+                    int alt_indicator = stoi(genotype_columns[i]);
+                    if(alt_indicator == 0) continue;
+                    alt_list.push_back(temp_alt_list[alt_indicator-1]);
+                }
+                // A genotype such as 0/2 leaves one allele. alt_list[1] is read below whenever
+                // the flag is set, so the flag has to agree with the number of alleles kept.
+                is_multi_alternatives = alt_list.size() > 1;
+            }else{
+                // without genotypes only the first two alternatives are used
+                alt_list.push_back(temp_alt_list[0]);
+                alt_list.push_back(temp_alt_list[1]);
+            }
+        }
+        // longest insertion / deletion over the retained alternatives
+        int snp_ins = max(0, (int)alt_list[0].length() - (int)ref.length());
+        int snp_del = max(0, (int)ref.length() - (int)alt_list[0].length());
+        if(is_multi_alternatives){
+            snp_ins = max(snp_ins, (int)alt_list[1].length() - (int)ref.length());
+            snp_del = max(snp_del, (int)ref.length() - (int)alt_list[1].length());
+        }
+        if(snp_ins > VAR_LEN || snp_del > VAR_LEN){
+            long_num ++;
+            continue;
+        }
+        DiploidVariant dv(pos, ref, alt_list, is_heterozygous_variant, is_multi_alternatives, snp_del, snp_ins, flag, quality, is_zero_one_var);
+        auto chr_it = chrid_by_chrname.find(chr_name);
+        if(chr_it == chrid_by_chrname.end()){
+            cout << "[VarMatch] skip current variant as no corresponding reference genome sequence found." << endl;
+            continue;
+        }
+        int chr_id = chr_it->second;
+        if(flag == false){
+            ref_variant_by_chrid[chr_id]->push_back(dv);
+        }else{
+            que_variant_by_chrid[chr_id]->push_back(dv);
+            query_variant_strings.push_back(line);
+        }
+        total_num++;
+    }
+    vcf_file.close();
+    if(flag){
+        sort(quality_list.begin(), quality_list.end());
+        vector<float> temp_percentage_list;
+        temp_percentage_list.push_back(0.0);
+        threshold_list.push_back(0.0);
+        // With no quality value there is nothing to index or to divide by; only the 0.0 entries remain.
+        if(!quality_list.empty()){
+            const int quality_count = (int)quality_list.size();
+            auto qual_lower_it = lower_bound(quality_list.begin(), quality_list.end(), QUAL_LOWER_BOUND);
+            int qual_lower_index = qual_lower_it - quality_list.begin();
+            int rest_size = quality_count - qual_lower_index;
+            for(int i = 1; i < (int)per_list.size(); i++){
+                // quality found per_list[i] of the way through the values >= QUAL_LOWER_BOUND
+                int additional_index = (int)(rest_size * per_list[i]);
+                int real_index = qual_lower_index + additional_index;
+                if(real_index >= quality_count) real_index = quality_count - 1;
+                double quality = quality_list[real_index];
+                threshold_list.push_back(quality);
+                auto quality_lowit = lower_bound(quality_list.begin(), quality_list.end(), quality);
+                int quality_low_index = quality_lowit - quality_list.begin();
+                // following program will retain variants >= quality threshold
+                int quality_size = quality_low_index + 1; // counting number, +/- 1 does not matter
+                if(quality_size > quality_count) quality_size = quality_count;
+                double percentage = (double)quality_size/ quality_count;
+                temp_percentage_list.push_back(percentage);
+            }
+        }
+        threshold_num = threshold_list.size();
+        // revise percentage
+        per_list = temp_percentage_list;
+    }
+    cout << flag << "," << total_num << "," << long_num << endl;
+    return total_num;
+}
+bool WholeGenome::ReadVariantFileList(string filename){
+// Edit distance between the reference allele of dv and the alternative allele
+// selected by allele_indicator (an index into dv.alts).
+int WholeGenome::ScoreEditDistance(DiploidVariant & dv, int allele_indicator){
+    const auto & chosen_alt = dv.alts[allele_indicator];
+    const int distance = EditDistance(dv.ref, chosen_alt);
+    return distance;
+}
+// Levenshtein distance between s1 and s2 (unit cost for insertion, deletion and
+// substitution), computed with two rolling rows of length s2.size()+1.
+inline int WholeGenome::EditDistance(const std::string& s1, const std::string& s2)
+{
+    const std::size_t rows = s1.size();
+    const std::size_t cols = s2.size();
+    std::vector<unsigned int> current(cols + 1), previous(cols + 1);
+    // row for the empty prefix of s1: distance to the first c characters of s2 is c
+    for (unsigned int c = 0; c < previous.size(); ++c) {
+        previous[c] = c;
+    }
+    for (unsigned int r = 0; r < rows; ++r) {
+        current[0] = r + 1;
+        for (unsigned int c = 0; c < cols; ++c) {
+            const unsigned int from_above = previous[c + 1] + 1;
+            const unsigned int from_left = current[c] + 1;
+            const unsigned int from_diagonal = previous[c] + (s1[r] == s2[c] ? 0 : 1);
+            current[c + 1] = std::min(std::min(from_above, from_left), from_diagonal);
+        }
+        // the row just filled becomes the "previous" row of the next iteration
+        current.swap(previous);
+    }
+    return previous[cols];
+}
+// Needleman Wunsch Initialization
+// Fills the first row (M+1 cells) and the first column (N+1 cells) of the score
+// matrix with -1 per step and sets the matching traceback marks:
+// '-' along the first row, '|' down the first column, '*' in the corner.
+inline void WholeGenome::initialize_score_matrix(int **score, char **trackBack, int M, int N)
+{
+    for (int col = 0; col <= M; ++col) {
+        score[0][col] = -col;
+        trackBack[0][col] = '-';
+    }
+    for (int row = 0; row <= N; ++row) {
+        score[row][0] = -row;
+        trackBack[row][0] = '|';
+    }
+    trackBack[0][0] = '*';
+}
+// Global alignment of S1 and S2 (Needleman-Wunsch) where a match costs 0 and a
+// mismatch or a gap costs 1 (scores are negative or zero).
+//   S1, S2: sequences to align
+//   R1, R2: receive the aligned forms of S1 and S2, with '-' marking a gap
+// Returns the alignment score, i.e. the bottom-right cell of the score matrix.
+int WholeGenome::needleman_wunsch(string S1, string S2, string &R1, string &R2)
+{
+    int M = S1.length();
+    int N = S2.length();
+    // Both matrices have N+1 rows (prefixes of S2) and M+1 columns (prefixes of S1).
+    int **score = new int *[N+1];
+    char **trackBack = new char *[N+1];
+    for (int i = 0; i <= N; i++)
+    {
+        score[i] = new int [M+1];
+        trackBack[i] = new char [M+1];
+    }
+    R1 = "";
+    R2 = "";
+    initialize_score_matrix(score, trackBack, M, N);
+    // traceback marks: '*' diagonal step, '-' step to the right, '|' step downward
+    for (int i = 1; i <=N; i++)
+    {
+        for (int k = 1; k <= M; k++)
+        {
+            char S1_k = S1[k-1];
+            char S2_i = S2[i-1];
+            int matchingCost = score[i-1][k-1];
+            if(S1_k != S2_i) matchingCost--;
+            int rightCost = score[i][k-1] - 1;
+            int downCost = score[i-1][k] - 1;
+            // the diagonal wins only when strictly better; ties go to '-' and then to '|'
+            if (matchingCost > rightCost && matchingCost > downCost)
+            {
+                score[i][k] = matchingCost;
+                trackBack[i][k] = '*';
+            }else if(rightCost >= downCost)
+            {
+                score[i][k] = rightCost;
+                trackBack[i][k] = '-';
+            }else
+            {
+                score[i][k] = downCost;
+                trackBack[i][k] = '|';
+            }
+        }
+    }
+    // walk back from the bottom-right corner; the alignment is collected in reverse
+    int n = N;
+    int m = M;
+    while(n > 0 || m > 0)
+    {
+        if (trackBack[n][m] == '*')
+        {
+            R1 += S1[m-1];
+            R2 += S2[n-1];
+            n--;
+            m--;
+        }else if(trackBack[n][m] == '-')
+        {
+            R1 += S1[m-1];
+            R2 += '-';
+            m--;
+        }else if(trackBack[n][m] == '|')
+        {
+            R1 += '-';
+            R2 += S2[n-1];
+            n--;
+        }
+    }
+    reverse(R1.begin(), R1.end());
+    reverse(R2.begin(), R2.end());
+    int result = score[N][M];
+    // everything here was allocated with new[], so it must be released with delete[]
+    for (int i = 0; i <= N; i++)
+    {
+        delete[] score[i];
+        delete[] trackBack[i];
+    }
+    delete[] score;
+    delete[] trackBack;
+    return result;
+}
+// Distributes the bases of alt over the positions of ref according to their
+// global alignment (needleman_wunsch).
+//   ref, alt:   reference and alternative allele
+//   alt_vector: ref.size() strings are appended; the string for a ref position collects the
+//               alt base aligned to it plus any inserted bases that follow it (insertions
+//               ahead of the first ref base go to position 0); a deleted ref base keeps an
+//               empty string
+// Nothing is appended when ref is empty.
+void WholeGenome::GenerateAltVector(string ref, string alt, vector<string> & alt_vector){
+    if(ref.size() == 0) return;
+    string ref_match = "";
+    string alt_match = "";
+    needleman_wunsch(ref, alt, ref_match, alt_match);
+    const int last_ref_index = (int)ref.size() - 1;
+    for(int i = 0; i < (int)ref.size(); i++){
+        alt_vector.push_back("");
+    }
+    int current_ref_index = -1; // index of the ref base consumed most recently
+    for(int i = 0; i < (int)ref_match.size(); i++){
+        if(ref_match[i] == '-'){
+            // inserted base: attach it to the last consumed ref position, or to 0 if none yet
+            alt_vector[max(current_ref_index, 0)].push_back(alt_match[i]);
+        }else if(alt_match[i] == '-'){
+            // deleted ref base: consume it without adding anything
+            current_ref_index ++;
+        }else{
+            current_ref_index ++;
+            // clamp instead of writing past the end should the index ever overrun
+            alt_vector[min(current_ref_index, last_ref_index)].push_back(alt_match[i]);
+        }
+    }
+    return;
+}
+// Clusters the variants of every chromosome, running up to thread_num chromosomes at a time
+// (thread_num-1 worker threads plus the calling thread), then appends the per-chromosome
+// clusters to variants_by_cluster in chromosome order. Always returns true.
+bool WholeGenome::ParallelClustering(){
+    // parallel by chr
+    variant_cluster_by_chrid = new vector<vector<VariantIndicator>> *[chrom_num];
+    for (int j = 0; j < chrom_num; j++) {
+        variant_cluster_by_chrid[j] = new vector<vector<VariantIndicator>>;
+    }
+    // a thread count of zero would divide by zero below; treat it as one
+    const int worker_num = max(thread_num, 1);
+    int parallel_steps = chrom_num / worker_num;
+    if(parallel_steps*worker_num < chrom_num) parallel_steps += 1;
+    int chr_id = 0;
+    for(int i = 0; i < parallel_steps; i++){
+        vector<thread> threads;
+        // worker threads take the next chromosomes, always leaving one for the calling thread
+        for(int j = 0; j < worker_num-1 && chr_id < chrom_num-1; j++){
+            if(chrname_by_chrid.find(chr_id) != chrname_by_chrid.end()){
+                if(ref_variant_by_chrid[chr_id]->size() > 0 && que_variant_by_chrid[chr_id]->size() > 0){
+                    threads.push_back(thread(&WholeGenome::SingleThreadClustering, this, chr_id));
+                }
+            }
+            chr_id ++;
+        }
+        // NOTE(review): unlike the worker path, this call is not skipped when one of the two
+        // variant lists is empty - confirm SingleThreadClustering copes with that.
+        if(chr_id < chrom_num){
+            if(chrname_by_chrid.find(chr_id) != chrname_by_chrid.end()){
+                SingleThreadClustering(chr_id);
+            }
+            chr_id ++;
+        }
+        for(auto & worker : threads){
+            worker.join();
+        }
+        threads.clear();
+    }
+    for(int i = 0; i < chrom_num; i++){
+        if(variant_cluster_by_chrid[i]->size() > 0){
+            variants_by_cluster.insert(variants_by_cluster.end(), variant_cluster_by_chrid[i]->begin(), variant_cluster_by_chrid[i]->end());
+        }
+    }
+    // clean at the end of function
+    for(int j = 0; j < chrom_num; j++){
+        variant_cluster_by_chrid[j]->clear();
+        delete variant_cluster_by_chrid[j];
+    }
+    delete[] variant_cluster_by_chrid;
+    return true;
+}
+bool WholeGenome::ParallelMatching(){
+bool WholeGenome::TBBMatching()
+// Reports whether sequence (compared case-insensitively) starts with a unit that is
+// repeated back to back at least twice, with every complete unit-sized chunk after the
+// first equal to it; a trailing partial chunk is not compared.
+// A sequence of length 1 is reported as a repeat. unit_threshold does not influence the result.
+bool WholeGenome::CheckTandemRepeat(string sequence, int unit_threshold) {
+    (void)unit_threshold;
+    const int total_length = (int)sequence.length();
+    if(total_length == 1) return true;
+    transform(sequence.begin(), sequence.end(), sequence.begin(), ::toupper);
+    const int longest_unit = total_length / 2 + 1;
+    for (int unit_length = 1; unit_length <= longest_unit; unit_length++) {
+        int copies = 1;
+        bool all_copies_match = true;
+        // compare every complete chunk after the first against the leading unit
+        for (int start = unit_length; start + unit_length <= total_length; start += unit_length) {
+            if (sequence.compare(start, unit_length, sequence, 0, unit_length) != 0) {
+                all_copies_match = false;
+                break;
+            }
+            copies ++;
+        }
+        if (all_copies_match && copies > 1) return true;
+    }
+    return false;
+}
+// preprocess
+// Prepares one cluster of variants for matching and runs the matcher on it.
+//   thread_index:    selects the per-thread result containers
+//   threshold_index: selects the per-threshold counters
+//   chr_id:          chromosome the cluster belongs to (index into genome_sequences)
+//   variant_list:    reference and query variants of the cluster; sorted in place
+//   cluster_id:      passed through to MatchingSingleClusterBaseExtending
+// Returns false when one of the two sets is empty or the single-variant size filter rejects
+// the cluster; returns true otherwise, including after the direct-match shortcut.
+bool WholeGenome::MatchVariantListInThread(int thread_index, 
+    int threshold_index,
+    int chr_id,
+    vector<DiploidVariant> & variant_list,
+    int cluster_id){
+    //===================================================
+    sort(variant_list.begin(), variant_list.end());
+    // decide reference sequence
+    vector<DiploidVariant> separate_var_list[2];
+    vector<Interval> intervals;
+    // separate into ref and que
+    // total_mil / total_mdl: sums of the variants' mil and mdl fields
+    // (presumably maximum insertion / deletion length - confirm against DiploidVariant)
+    int total_mil = 0;
+    int total_mdl = 0;
+    // min_pos / max_pos: genome span covered by the reference alleles of the cluster
+    int min_pos = genome_sequences[chr_id].length() + 1;
+    int max_pos = -1;
+    for (int i = 0; i < variant_list.size(); i++) {
+        int flag = 0;
+        if (variant_list[i].flag) flag = 1; // flag indicate if the variant is from ref set(0) or query set(1)
+        int pos = variant_list[i].pos;
+        separate_var_list[flag].push_back(variant_list[i]);
+        total_mil += variant_list[i].mil;
+        total_mdl += variant_list[i].mdl;
+        auto ref_sequence = variant_list[i].ref;
+        auto alt_sequences = variant_list[i].alts;
+        min_pos = min(pos, min_pos);
+        max_pos = max((int)(pos + ref_sequence.length()), max_pos);
+        int end_pos = pos + ref_sequence.length() - 1; // included end position!!
+        intervals.push_back(Interval(pos, end_pos));
+    }
+    // widen the span by one base on each side, clamped to the chromosome
+    min_pos = max(min_pos - 1, 0);
+    max_pos = min(max_pos + 1, (int)genome_sequences[chr_id].length()); //exclusive
+    // nothing to match when either set contributes no variant
+    if (separate_var_list[0].size() == 0 || separate_var_list[1].size() == 0) {
+        //dout << separate_var_list[0].size() << ", " << separate_var_list[1].size() << endl;
+        return false;
+    }
+    if (separate_var_list[0].size() == 1 && separate_var_list[1].size() == 1){
+        // try direct match to save time
+        if(separate_var_list[0][0] == separate_var_list[1][0]){
+            DiploidVariant tv = separate_var_list[0][0];
+            // record: chromosome name, 1-based position, ref, alt (two alts joined by '/'), then five "." columns
+            string match_record = chrname_by_chrid[chr_id] + "\t" + to_string(tv.pos+1) + "\t" + tv.ref + "\t" + tv.alts[0];
+            if(tv.multi_alts) match_record += "/" + tv.alts[1];
+            match_record += "\t.\t.\t.\t.\t.\n";
+            // here we need to push back for all mode_index
+            //complex_match_records[thread_index]->push_back(match_record);
+            int edit_distance = CalculateEditDistance(tv, 0, 0);
+            // the same record and counts are added for every mode in mode_index_list
+            for(int mi = 0; mi < mode_index_list.size(); mi ++){
+                int mode_i = mode_index_list[mi];
+                //if(mi == 0){
+                // this line should be recovered
+                match_records_by_mode_by_thread[thread_index][mode_i]->push_back(match_record);
+                //}else{
+                //    match_records_by_mode_by_thread[thread_index][mode_i]->push_back("$"+to_string(match_records_by_mode_by_thread[thread_index][0]->size()));
+                    // use dollar to represent that it is the same
+                //}
+                baseline_total_match_num[thread_index][threshold_index]->at(mode_i)++;
+                query_total_match_num[thread_index][threshold_index]->at(mode_i)++;
+                baseline_total_edit_distance[thread_index][threshold_index]->at(mode_i) +=  edit_distance;
+                query_total_edit_distance[thread_index][threshold_index]->at(mode_i) += edit_distance;
+                //calculate the edit distance
+            }
+            // output match result
+            return true;
+        }
+        // if not match, still can match by changing genome
+    }else if(separate_var_list[0].size() == 1 || separate_var_list[1].size() == 1){
+        // exactly one set holds a single variant: flag is that set, r_flag the other one
+        int flag = 0;
+        if(separate_var_list[1].size() == 1) flag = 1;
+        int r_flag = 1-flag;
+        if(separate_var_list[r_flag].size() > 4){
+            int total_r_mdl = 0;
+            int total_r_mil = 0;
+            for(int k = 0; k < separate_var_list[r_flag].size(); k++){
+                DiploidVariant var = separate_var_list[r_flag][k];
+                int var_mdl = var.mdl;
+                int var_mil = var.mil;
+                int ref_length = var.ref.length();
+                total_r_mdl += var_mdl;
+                total_r_mil += var_mil;
+            }
+            // give up when the single variant changes more bases than the whole other set combined
+            if(max(separate_var_list[flag][0].mdl, separate_var_list[flag][0].mil) > max(total_r_mdl, total_r_mil)) return false;
+        }
+    }
+    separate_var_list[0].clear();
+    separate_var_list[1].clear();
+    // remove singular variant
+    // [todo] try removing this filter to see running time changes
+    // appliable_flag[k] == false excludes variant k from the decision points built below
+    vector<bool> appliable_flag;
+    int total_change = total_mil+total_mdl;
+    if(variant_list.size() > EASY_MATCH_VAR_NUM){
+        for(int k = 0; k < variant_list.size(); k++){
+            DiploidVariant cur_var = variant_list[k];
+            int max_change = max(cur_var.mil, cur_var.mdl);
+            // excluded when its largest change exceeds the rest of total_change
+            if(max_change > total_change-max_change){
+                appliable_flag.push_back(false);
+                //dout << "this variant is removed" << endl;
+            }else{
+                appliable_flag.push_back(true);
+            }
+        }
+    }else{
+        for(int k = 0; k < variant_list.size(); k++){
+            appliable_flag.push_back(true);
+        }
+    }
+    // reference subsequence for the cluster; positions below are relative to offset
+    string subsequence = genome_sequences[chr_id].substr(min_pos, max_pos - min_pos);
+    ToUpper(subsequence); // subsequence only contains upper char
+    int offset = min_pos;
+    int subsequence_length = max_pos - min_pos;
+    // have subsequence in hand
+    //generate decision point
+    // choices_by_pos[0] / [1]: for the reference / query set, position in subsequence -> index into variant_list
+    multimap<int, int> * choices_by_pos[2];
+    // choice by pos is to also equal to var by pos
+    for(int i = 0; i < 2; i++){
+        choices_by_pos[i] = new multimap<int, int>();
+    }
+    for(int index = 0; index < variant_list.size(); index++){
+        if(!appliable_flag[index]) continue;
+        // remove decision point if not applicable
+        int pos = variant_list[index].pos - offset;
+        int flag = 0;
+        if(variant_list[index].flag) flag = 1;
+        choices_by_pos[flag]->insert(pair<int, int>(pos, index));
+        //dout << pos << index << endl;
+    }
+    // sync points: end of every merged variant interval (relative to offset), plus the last
+    // index of subsequence when the final interval ends before it
+    vector<Interval> mergered_intervals = merge(intervals);
+//    unordered_map<int, bool> sync_points;
+//    for(int i = 0; i < mergered_intervals.size(); i++){
+//        sync_points[mergered_intervals[i].end-offset] = true;
+//    }
+    vector<int> sync_points;
+    for(int i = 0; i < mergered_intervals.size(); i++){
+        sync_points.push_back(mergered_intervals[i].end-offset);
+    }
+    if(sync_points.back() < subsequence.size() - 1){
+        sync_points.push_back(subsequence.size()-1);
+    }
+    // run the matcher once for every combination of score unit, match mode and score scheme
+    int score_unit;
+    int match_mode;
+    int score_scheme;
+    for(int i = 0; i < score_unit_list.size(); i++){
+        score_unit = score_unit_list[i];
+        for(int j = 0; j < match_mode_list.size(); j++){
+            match_mode = match_mode_list[j];
+            for(int k = 0; k < score_scheme_list.size(); k++){
+                score_scheme = score_scheme_list[k];
+                // the returned value is not used
+                bool method2 = MatchingSingleClusterBaseExtending(
+                                            cluster_id,
+                                            thread_index,
+                                            variant_list,
+                                            subsequence,
+                                            offset,
+                                            choices_by_pos,
+                                            sync_points,
+                                            chr_id,
+                                            score_unit,
+                                            match_mode,
+                                            score_scheme,
+                                            threshold_index);
+            }
+        }
+    }
+    // release the two multimaps allocated above
+    for(int i = 0; i < 2; i++){
+        delete choices_by_pos[i];
+    }
+    //delete choices_by_pos;
+    return true;
+// transfer indicator to variant
+// Handles the clusters with ids in [start, end), stopping early at the end of
+// variants_by_cluster. For every quality threshold the cluster's indicators are turned
+// into variants (taken from the reference or the query lists), variants below the
+// threshold are dropped, and the rest is handed to MatchVariantListInThread.
+// Clusters with at most one indicator are skipped. Always returns true.
+bool WholeGenome::ClusteringMatchInThread(int start, int end, int thread_index) {
+    for (int cluster_id = start; cluster_id < end; cluster_id++) {
+        if(cluster_id >= variants_by_cluster.size()) break;
+        vector<VariantIndicator> indicators = variants_by_cluster[cluster_id];
+        if(indicators.size() <= 1) continue;
+        for(int t = 0; t < threshold_num; t++){
+            const double min_quality = threshold_list[t];
+            vector<DiploidVariant> variant_list;
+            int chr_id = -1;
+            // resolve each indicator to its variant
+            for(const VariantIndicator & vi : indicators){
+                chr_id = vi.chr_id;
+                vector<DiploidVariant> * source = vi.refer ? ref_variant_by_chrid[chr_id]
+                                                           : que_variant_by_chrid[chr_id];
+                const DiploidVariant & var = source->at(vi.var_id);
+                if(var.qual < min_quality) continue;
+                variant_list.push_back(var);
+            }
+            const bool valid_chromosome = (chr_id != -1 && chr_id < chrom_num);
+            if(!valid_chromosome){
+                cout << "[VarMatch] Error in matching single cluster" << endl;
+                continue;
+            }
+            MatchVariantListInThread(thread_index, t, chr_id, variant_list, cluster_id);
+        }
+    }
+    return true;
+}
+// to reduce memory usage of paths, move all functions about SequencePath out into WholeGenome with a parameter SequencePath
+// Looks at the variants registered at pos, first in choices_by_pos[0] and then in
+// choices_by_pos[1], and returns the index of the first one whose entry in
+// sp.choice_vector is <= MEANING_CHOICE_BOUND. Returns -1 when there is none.
+int WholeGenome::PathNeedDecision(SequencePath& sp, multimap<int, int> * choices_by_pos[], int pos){
+    for(int side = 0; side < 2; side++){
+        // equal_range yields an empty range when nothing is registered at pos
+        auto candidates = choices_by_pos[side]->equal_range(pos);
+        for(auto entry = candidates.first; entry != candidates.second; ++entry){
+            const int var_index = entry->second;
+            if(sp.choice_vector[var_index] <= MEANING_CHOICE_BOUND) return var_index;
+        }
+    }
+    return -1;
+}
+// Check that the donor sequences built so far are still compatible:
+// sp.donor_sequences[0] is compared with sp.donor_sequences[2] and, when
+// match_mode == 0, sp.donor_sequences[1] with sp.donor_sequences[3].
+// if match_mode == 1, i.e. variant match mode, only check one sequence
+// otherwise, check two sequences
+// (the code tests match_mode == 0 for the two-sequence case; every other
+// value takes the one-sequence branch)
+//
+// Returns 0 when the compared sequences agree and -1 on a mismatch.
+// Side effects: sets sp.same_donor_len and advances
+// sp.current_equal_donor_pos[] to the last position verified as equal.
+int WholeGenome::CheckPathEqualProperty(SequencePath & sp, int match_mode)
+    if(match_mode == 0){
+        //bool equal_sequences = false;
+        // same ref position, same donor length, same donor sequence, keep
+        if(sp.donor_sequences[0].length() == sp.donor_sequences[2].length() &&
+           sp.donor_sequences[1].length() == sp.donor_sequences[3].length()){
+            // both pairs have equal lengths: they must be fully identical
+            if(sp.donor_sequences[0] == sp.donor_sequences[2] && sp.donor_sequences[1] == sp.donor_sequences[3]){
+                sp.same_donor_len = true;
+                sp.current_equal_donor_pos[0] = sp.donor_sequences[0].length()-1;
+                sp.current_equal_donor_pos[1] = sp.donor_sequences[1].length()-1;
+                return 0;
+            }else{
+                //dout << "delete this path at pos: " << sp.current_genome_pos << " for not equal donor sequence";
+                //PrintPath(sp);
+                return -1;
+            }
+        }else{
+            // lengths differ: only the shared prefix can be compared, and only
+            // the part after the last position already verified is re-checked
+            sp.same_donor_len = false;
+            int min_donor_identical_len[2];
+            for(int i = 0; i < 2; i++){
+                // compare each strain
+                min_donor_identical_len[i] = min(sp.donor_sequences[0+i].length(), sp.donor_sequences[2+i].length());
+                for(int k = sp.current_equal_donor_pos[i]+1; k < min_donor_identical_len[i]; k++){
+                    if(sp.donor_sequences[0+i][k] != sp.donor_sequences[2+i][k]){
+                        return -1;
+                    }
+                }
+                sp.current_equal_donor_pos[i] = min_donor_identical_len[i]-1;
+            }
+            return 0;
+        }
+    }else{
+        // one-sequence mode: same logic as above restricted to index 0 vs 2
+        if(sp.donor_sequences[0].length() == sp.donor_sequences[2].length()){
+            if(sp.donor_sequences[0] == sp.donor_sequences[2]){
+                sp.same_donor_len = true;
+                sp.current_equal_donor_pos[0] = sp.donor_sequences[0].length()-1;
+                //sp.current_equal_donor_pos[1] = sp.donor_sequences[1].length()-1;
+                return 0;
+            }else{
+                //dout << "delete this path at pos: " << sp.current_genome_pos << " for not equal donor sequence";
+                //PrintPath(sp);
+                return -1;
+            }
+        }else{
+            sp.same_donor_len = false;
+            int min_donor_identical_len[2];
+            //for(int i = 0; i < 2; i++)
+            // the loop over strains is fixed to the first one in this mode
+            int i = 0;
+            {
+                // compare each strain
+                min_donor_identical_len[i] = min(sp.donor_sequences[0+i].length(), sp.donor_sequences[2+i].length());
+                for(int k = sp.current_equal_donor_pos[i]+1; k < min_donor_identical_len[i]; k++){
+                    if(sp.donor_sequences[0+i][k] != sp.donor_sequences[2+i][k]){
+                        return -1;
+                    }
+                }
+                sp.current_equal_donor_pos[i] = min_donor_identical_len[i]-1;
+            }
+            return 0;
+        }
+    }
+// Extend path `sp` towards its next sync point.
+// one step is not one nt, but to the next sync point
+// i.e. one step, one sync point
+//
+// Genome positions from sp.current_genome_pos+1 through
+// sync_points[sp.reached_sync_num] (inclusive) are visited in order. At each
+// position the donor sequences receive either the reference base (when the
+// path's string_sequences entry is ".") or the entry itself. When
+// match_mode == 1 only donor sequences 0 and 2 are extended.
+//
+// variant_need_decision is an output: it is written only when 1 is returned
+// and then holds the index reported by PathNeedDecision.
+int WholeGenome::PathExtendOneStep(SequencePath& sp,
+                                   multimap<int, int> * choices_by_pos[],
+                                   const string & reference_sequence,
+                                   vector<int> & sync_points,
+                                   int match_mode,
+                                   int & variant_need_decision){
+    //-1 operation fail, path deleted
+    //0 operation succeed
+    //1 operation fail, need to make decision first, then extend
+    //2 path reached end, need to check if good
+    if(sp.reached_sync_num >= sync_points.size()) return -1;
+    int start_pos = sp.current_genome_pos + 1;
+    int end_pos = sync_points[sp.reached_sync_num]; // the next sync point, end pos included
+    for(int next_genome_pos = start_pos; next_genome_pos <= end_pos; next_genome_pos++){
+        // before make decision, we need to check if the equal property still holds
+        int variant_need_decision_ = PathNeedDecision(sp, choices_by_pos, next_genome_pos);
+        if(variant_need_decision_ >= 0){
+            // check equal property
+            int statu = CheckPathEqualProperty(sp, match_mode);
+            if(statu == -1) return -1;
+            variant_need_decision = variant_need_decision_;
+            return 1; // need decision on next position
+        }
+        // else extend one nt
+        for(int i = 0; i < 4; i++){
+            if(match_mode == 1){
+                // odd indices are not maintained in this mode
+                if(i%2 != 0) continue;
+            }
+            if(sp.string_sequences[i][next_genome_pos] == "."){
+                // "." marks a position without a change: copy the reference base
+                sp.donor_sequences[i] += reference_sequence[next_genome_pos];
+            }else{
+                sp.donor_sequences[i] += sp.string_sequences[i][next_genome_pos];
+            }
+        }
+        sp.current_genome_pos = next_genome_pos;
+    }
+    // reaches the end of end_pos
+    sp.reached_sync_num ++;
+    if(sp.reached_sync_num >= sync_points.size()){
+        // last sync point is the end of ref genome sequence
+        // NOTE(review): both pairs (0 vs 2 and 1 vs 3) are compared here even
+        // when match_mode == 1; donor sequences 1 and 3 are never extended by
+        // this function in that mode -- confirm this is intended.
+        if(sp.donor_sequences[0] == sp.donor_sequences[2] &&
+           sp.donor_sequences[1] == sp.donor_sequences[3]){
+            return 2;
+       }else{
+            //dout << "delete this path at pos: " << sp.current_genome_pos << " for reach end but not equal";
+            //PrintPath(sp);
+            return -1;
+       }
+    }
+    // an intermediate sync point: the path survives only if it is still consistent
+    return CheckPathEqualProperty(sp, match_mode);
+    // first try to converge, then extend
+// Score contributed by taking `choice` for variant dv.
+//
+//   choice       values <= NOT_USE contribute 0. In genotype mode
+//                (match_mode == 0) -1 scores allele 0 only, -2 scores allele 1
+//                only, 0 scores allele 0 plus allele 1 when the variant has two
+//                distinct alts (multi_alts && !zero_one_var), and any other
+//                value scores both alleles. In every other match_mode the
+//                choice is passed to ScoreEditDistance as the allele index.
+//   score_unit   0: each used variant counts 1; 1: the score is the edit
+//                distance from ScoreEditDistance; other values leave it at 0.
+//   score_scheme 0: return the score unconditionally;
+//                1: return it only for variants with dv.flag == false;
+//                2: return it only for variants with dv.flag == true
+//                   (variants on the other side contribute 0).
+//
+// The original code had no return statement for any other score_scheme, which
+// is undefined behaviour in a non-void function. Such values are now treated
+// like scheme 0.
+// NOTE(review): confirm that "count everything" is the intended meaning of
+// schemes other than 0, 1 and 2.
+int WholeGenome::CalculateScore(DiploidVariant & dv,
+                                int choice,
+                                int score_unit,
+                                int match_mode,
+                                int score_scheme){
+    int score = 0;
+    if(choice <= NOT_USE) return score;
+    if(score_unit == 0){
+        score = 1;
+    }else if(score_unit == 1){
+        if(match_mode == 0){
+            if(choice == -1){
+                score += ScoreEditDistance(dv, 0);
+            }else if(choice == -2){
+                score += ScoreEditDistance(dv, 1);
+            }else if(choice == 0){
+                score += ScoreEditDistance(dv, 0);
+                if(dv.multi_alts && !dv.zero_one_var){
+                    score += ScoreEditDistance(dv, 1);
+                }
+            }else{
+                score += ScoreEditDistance(dv, 0);
+                score += ScoreEditDistance(dv, 1);
+            }
+        }else{
+            score += ScoreEditDistance(dv, choice);
+        }
+    }
+    if(score_scheme == 1){
+        // only variants with flag == false are counted
+        return dv.flag ? 0 : score;
+    }
+    if(score_scheme == 2){
+        // only variants with flag == true are counted
+        return dv.flag ? score : 0;
+    }
+    // scheme 0, and the defined fallback for any unrecognised scheme
+    return score;
+// Dedicated edit-distance calculation for a variant and the choice made for it.
+// Choices <= NOT_USE yield 0. When match_mode is not 0 the result is twice
+// ScoreEditDistance(dv, choice). In genotype mode (match_mode == 0) the
+// per-allele distances are summed according to the choice:
+//   -1 -> allele 0 only, -2 -> allele 1 only,
+//    0 -> allele 0, plus allele 1 when dv has two distinct alts,
+//   anything else -> allele 0 plus allele 1.
+int WholeGenome::CalculateEditDistance(DiploidVariant & dv,
+                                int choice,
+                                int match_mode){
+    if(choice <= NOT_USE) return 0;
+    if(match_mode != 0){
+        return 2 * ScoreEditDistance(dv, choice);
+    }
+    switch(choice){
+        case -1:
+            return ScoreEditDistance(dv, 0);
+        case -2:
+            return ScoreEditDistance(dv, 1);
+        case 0: {
+            int distance = ScoreEditDistance(dv, 0);
+            if(dv.multi_alts && !dv.zero_one_var){
+                distance += ScoreEditDistance(dv, 1);
+            }
+            return distance;
+        }
+        default: {
+            int distance = ScoreEditDistance(dv, 0);
+            distance += ScoreEditDistance(dv, 1);
+            return distance;
+        }
+    }
+// function no longer used, move to VariantMakeDecisionNoGenotype
+// no genotype means you can maintain only one strand
+// for simplicity, also work on original SequencePath data structure
+// when making decision, only decide one path
+// when extending, only extend one path
+// when comparing, only compare one path
+//
+// Branch path `sp` at position sp.current_genome_pos+1. For each of the two
+// sides (choices_by_pos[0] and choices_by_pos[1]) the candidate choices are
+// collected as (variant index, allele index) pairs, always including (-1, -1)
+// for "use nothing". One copy of sp is then pushed onto sequence_path_list for
+// every combination of a side-0 candidate with a side-1 candidate, with the
+// chosen alleles written into string_sequences[side*2] and the score updated
+// through CalculateScore. reference_sequence is not read here.
+// Always returns true.
+bool WholeGenome::PathMakeDecisionNoGenotype(SequencePath& sp,
+                                 vector<DiploidVariant> & variant_list,
+                                 multimap<int, int> * choices_by_pos[],
+                                 list<SequencePath> & sequence_path_list,
+                                 const string & reference_sequence,
+                                 int score_unit,
+                                 int match_mode,
+                                 int score_scheme)
+    int pos = sp.current_genome_pos+1;
+    vector<pair<int, int>> candidate_choices[2];
+    for(int i = 0; i < 2; i++){
+        // because if it's (-1,-1), it will do nothing, so it's ok to have this one...
+        candidate_choices[i].push_back(pair<int, int>(-1, -1));
+        // to maintain existence
+        // in this position, make choice of not use any variants, no matter if there is variant
+        pair<multimap<int, int>::iterator, multimap<int, int>::iterator> var_range;
+        var_range = choices_by_pos[i]->equal_range(pos);
+        for(auto it = var_range.first; it != var_range.second; ++it){
+            int var_index = (*it).second;
+            DiploidVariant var = variant_list[var_index];
+            // check if current var influence
+            string ref = var.ref; //even we do not know the offset, we know ref start from pos of reference_sequence
+            string alts[2];
+            alts[0] = var.alts[0];
+            alts[1] = alts[0];
+            if(var.multi_alts){ //here do not have to change anything
+                alts[1] = var.alts[1];
+            }
+            // not just purely consider if a variant can be applied, but if a choice
+            // A choice is rejected when a position it covers was already
+            // changed (entry != ".") and the allele deletes or alters that base.
+            bool choice_applicable = true;
+            for(int k = 0; k < ref.length(); k++){
+            // for each ref char
+                int y = 0;
+                // only the first strain is examined in no-genotype mode
+                if(sp.string_sequences[i*2+y][k+pos] != "."){
+                    // decision in this area has already been made
+                    if(k >= alts[y].length()){
+                        choice_applicable = false;
+                        break;
+                    }else{
+                        if(ref[k] != alts[y][k]){
+                            choice_applicable = false;
+                            break;
+                        }
+                    }
+                }
+            }
+            if(choice_applicable){
+                candidate_choices[i].push_back(pair<int, int>(var_index, 0));
+            }
+            if(var.multi_alts){ // here do not have to change anything
+                //if heterozygous, then there is another choice, check if it is applicable
+                // swap so that alts[0] now holds the second allele
+                string temp = alts[0];
+                alts[0] = alts[1];
+                alts[1] = temp;
+                choice_applicable = true;
+                for(int k = 0; k < ref.length(); k++){
+                // for each ref char
+                    //for(int y = 0; y < 2; y++)
+                    int y = 0;
+                    // for each strain
+                    if(sp.string_sequences[i*2+y][k+pos] != "."){
+                        // decision in this area has already been made
+                        if(k >= alts[y].length()){
+                            // should be a deletion
+                            choice_applicable = false;
+                            break;
+                        }else{
+                            // should be equal at current position
+                            // can be an insertion, as long as current position is the same
+                            if(ref[k] != alts[y][k]){
+                                choice_applicable = false;
+                                break;
+                            }
+                        }
+                    }
+                }
+                if(choice_applicable){
+                    candidate_choices[i].push_back(pair<int, int>(var_index, 1));
+                }
+            }
+        }
+    }
+    //dout << candidate_choices[0].size() << "," << candidate_choices[1].size() << endl;
+    for(int i = 0; i < candidate_choices[0].size(); i++){
+        for(int j = 0; j < candidate_choices[1].size(); j++){
+            // iterate all choices
+            SequencePath path = sp;
+            pair<int, int> var_choice[2];
+            var_choice[0] = candidate_choices[0][i];
+            var_choice[1] = candidate_choices[1][j];
+            for(int x = 0; x < 2; x++){
+                // iterate truth and predict
+                int var_index = var_choice[x].first;
+                if(var_index != -1){
+                    DiploidVariant var = variant_list[var_index];
+                    // if(var.flag != x){
+                    //     dout << "Error" << endl;
+                    // }
+                    string ref = var.ref;
+                    string alts[2];
+                    int c = var_choice[x].second;
+                    alts[0] = var.alts[c];
+                    path.score += CalculateScore(var,
+                                                 c,
+                                                 score_unit,
+                                                 match_mode,
+                                                 score_scheme);
+                    ToUpper(ref);
+                    ToUpper(alts[0]);
+                    int y = 0;
+                    int k = 0;
+                    // NOTE(review): ref.length()-1 is unsigned, so an empty ref
+                    // would wrap around -- presumably ref is never empty; confirm.
+                    for(; k < ref.length()-1; k++){
+                        if(k < alts[y].length()){
+                            if(ref[k] != alts[y][k]){
+                                path.string_sequences[x*2+y][pos+k] = alts[y].substr(k,1);
+                            }
+                            // else change nothing
+                        }else{
+                            // the allele is shorter than ref: this base is deleted
+                            path.string_sequences[x*2+y][pos+k] = "";
+                        }
+                    }
+                    // hence k == ref.length()-1, the last position
+                    // whatever remains of the allele is attached to this position
+                    if(k < alts[y].length()){
+                        string alt_part = alts[y].substr(k, alts[y].length()-k);
+                        if(alt_part.length() > 1){
+                            if(alt_part[0] == ref[k]){
+                                if(path.string_sequences[x*2+y][pos+k] == "."){
+                                    path.string_sequences[x*2+y][pos+k] = alt_part;
+                                }else{
+                                    // keep what is already there and append only the inserted tail
+                                    path.string_sequences[x*2+y][pos+k] += alt_part.substr(1, alt_part.size() - 1);
+                                }
+                            }else{
+                                path.string_sequences[x*2+y][pos+k] = alt_part;
+                            }
+                        }else{
+                            if(ref[k] != alts[y][k]){
+                                path.string_sequences[x*2+y][pos+k] = alt_part;
+                            }
+                        }
+                    }else{
+                        path.string_sequences[x*2+y][pos+k] = "";
+                    }
+                }
+                // record the decision, including (-1, -1) for "nothing used"
+                path.choice_made[x][pos] = var_choice[x];
+            }
+            sequence_path_list.push_back(path);
+        }
+    }
+    //expected number of inserted paths are 2,3,4,6,x...
+    return true;
+// Copy path `sp`, apply choice `c` for variant `variant_index` to the copy and
+// append the copy to sequence_path_list. `sp` itself is not modified, and
+// reference_sequence is not read here.
+//
+// Meaning of c, as implemented below (first strand / second strand):
+//   NOT_USE : the choice is recorded and the sequences are left unchanged
+//   -1      : ref / var.alts[0]
+//   -2      : ref / var.alts[1]
+//   c >= 0  : var.alts[c] / var.alts[1-c] when the variant has two distinct
+//             alts (multi_alts && !zero_one_var); otherwise var.alts[c] / ref
+//             when heterozygous, else var.alts[c] on both strands
+// The variant is written into string_sequences[2] and [3] when var.flag is
+// set and into [0] and [1] otherwise, starting at sp.current_genome_pos+1.
+// Always returns true.
+bool WholeGenome::AppendChangedSp(SequencePath& sp,
+                         vector<DiploidVariant> & variant_list,
+                         list<SequencePath> & sequence_path_list,
+                         const string & reference_sequence,
+                         int score_unit,
+                         int match_mode,
+                         int score_scheme,
+                         int variant_index,
+                         int c)
+    int pos = sp.current_genome_pos+1;
+    SequencePath path = sp;
+    if(c == NOT_USE){
+        path.choice_vector[variant_index] = c;
+        sequence_path_list.push_back(path);
+        return true;
+    }
+    pair<int, int> var_choice[2];
+    int x = 0;
+    int var_index = variant_index;
+    DiploidVariant var = variant_list[var_index];
+    if(var.flag) x = 1;
+    string ref = var.ref;
+    string alts[2];
+    if(c == -1){
+        alts[0] = ref;
+        alts[1] = var.alts[0];
+    }else if(c == -2){
+        alts[0] = ref;
+        alts[1] = var.alts[1];
+    }else if(c >= 0){
+        // c == 0 or 1
+        alts[0] = var.alts[c];
+        alts[1] = alts[0];
+        if(var.multi_alts && !var.zero_one_var){
+            // choose 1 or 0
+            alts[1] = var.alts[1- c];
+        }else{
+            // c is 0, choose 0 or -1
+            if(var.heterozygous) alts[1] = ref;
+        }
+    }else{
+        // NOTE(review): execution continues with both alts empty after this
+        // message -- confirm callers never pass such a value.
+        dout << "Unrecognized choice" << endl;
+    }
+    path.score += CalculateScore(var,
+                                 c,
+                                 score_unit,
+                                 match_mode,
+                                 score_scheme);
+    ToUpper(ref);
+    ToUpper(alts[0]);
+    ToUpper(alts[1]);
+    for(int y = 0; y < 2; y++){
+        // iterate two alts
+        string alt = alts[y];
+        // a strand that keeps the reference allele needs no edit
+        if(alt == ref) continue;
+        vector<string> alt_vector;
+        // presumably fills alt_vector with one string per reference base
+        // (it is indexed by every k < ref.length() below) -- see GenerateAltVector
+        GenerateAltVector(ref, alt, alt_vector);
+        int k = 0;
+        // NOTE(review): ref.length()-1 is unsigned, so an empty ref would wrap
+        // around -- presumably ref is never empty; confirm.
+        for(; k < ref.length()-1; k++){
+            if(alt_vector[k].size() != 1 || ref[k] != alt_vector[k][0]){
+                path.string_sequences[x*2+y][pos+k] = alt_vector[k];
+            }
+            // else changes nothing
+        }
+        // hence k == ref.length()-1, the last position
+        assert(k == ref.length()-1);
+        string alt_part = alt_vector[k];
+        if(alt_part.length() > 0){
+            if(alt_part.length() > 1){
+                if(alt_part[0] == ref[k]){
+                    if(path.string_sequences[x*2+y][pos+k] == "."){
+                        path.string_sequences[x*2+y][pos+k] = alt_part;
+                    }else{
+                        // keep what is already there and append only the inserted tail
+                        path.string_sequences[x*2+y][pos+k] += alt_part.substr(1, alt_part.size() - 1);
+                    }
+                }else{
+                    path.string_sequences[x*2+y][pos+k] = alt_part;
+                }
+            }else{
+                if(ref[k] != alt_vector[k][0]){
+                    path.string_sequences[x*2+y][pos+k] = alt_part;
+                }
+            }
+        }else{
+            // nothing left for the last reference base
+            path.string_sequences[x*2+y][pos+k] = "";
+        }
+    }
+    // choice made
+    path.choice_vector[variant_index] = c;
+    //dout << "after decision at variant " << variant_index << endl;
+    //PrintPath(path);
+    sequence_path_list.push_back(path);
+    return true;
+// Question: when you make decision, do you also need to align?
+// Answer: No, as it makes no difference, so currently you can skip alignment
+//
+// Branch path `sp` on variant `variant_index`, located at
+// sp.current_genome_pos+1. One new path is appended to sequence_path_list (via
+// AppendChangedSp) for every applicable choice:
+//   NOT_USE : always
+//   0       : var.alts[0] on the first strand; the second strand carries
+//             var.alts[1] (two distinct alts), ref (heterozygous) or
+//             var.alts[0]
+//   1 / -1  : heterozygous variants only, the two strands swapped; 1 when the
+//             variant has two distinct alts, -1 otherwise
+//   1 and -2: variants with multi_alts && zero_one_var; var.alts[1] on the
+//             first or on the second strand, ref on the other
+// A choice is applicable when every position it covers is either untouched
+// ("." in string_sequences) or keeps its reference base under the allele.
+// Strands that would carry the reference allele are not checked.
+//
+// Always returns true. The function is declared bool but originally had no
+// return statement, which is undefined behaviour; `return true` matches
+// AppendChangedSp and PathMakeDecisionNoGenotype.
+bool WholeGenome::VariantMakeDecision(SequencePath& sp,
+                         vector<DiploidVariant> & variant_list,
+                         list<SequencePath> & sequence_path_list,
+                         const string & reference_sequence,
+                         int score_unit,
+                         int match_mode,
+                         int score_scheme,
+                         int variant_index)
+    int pos = sp.current_genome_pos+1;
+    int var_index = variant_index;
+    DiploidVariant var = variant_list[var_index];
+    // also this variant may not be used
+    AppendChangedSp(sp,
+                variant_list,
+                sequence_path_list,
+                reference_sequence,
+                score_unit,
+                match_mode,
+                score_scheme,
+                var_index,
+                NOT_USE);
+    // row pair in string_sequences: 2/3 when var.flag is set, 0/1 otherwise
+    int i = 0;
+    if(var.flag) i = 1;
+    //PrintVariant(var);
+    // check if current var influence
+    string ref = var.ref; //even we do not know the offset, we know ref start from pos of reference_sequence
+    string alts[2];
+    alts[0] = var.alts[0];
+    alts[1] = alts[0];
+    if(var.multi_alts && !var.zero_one_var){
+        alts[1] = var.alts[1];
+    }else if(var.heterozygous){
+        alts[1] = ref;
+    }
+    // not just purely consider if a variant can be applied, but if a choice
+    // a strand that keeps the reference allele cannot conflict, so skip it
+    int skiped_y = -1;
+    if(alts[1] == ref) skiped_y = 1;
+    bool choice_applicable = true;
+    for(int k = 0; k < ref.length(); k++){
+    // for each ref char
+        for(int y = 0; y < 2; y++){
+            // for each strain
+            if(y == skiped_y) continue;
+            if(sp.string_sequences[i*2+y][k+pos] != "."){
+                // decision in this area has already been made
+                if(k >= alts[y].length()){
+                    choice_applicable = false;
+                    break;
+                }else{
+                    if(ref[k] != alts[y][k]){
+                        choice_applicable = false;
+                        break;
+                    }
+                }
+            }
+        }
+        if(!choice_applicable) break;
+    }
+    if(choice_applicable){
+        //candidate_choices[i].push_back(pair<int, int>(var_index, 0));
+        AppendChangedSp(sp,
+                        variant_list,
+                        sequence_path_list,
+                        reference_sequence,
+                        score_unit,
+                        match_mode,
+                        score_scheme,
+                        var_index,
+                        0);
+    }
+    if(var.heterozygous){
+        //if heterozygous, then there is another choice, check if it is applicable
+        // swap the alleles between the two strands
+        string temp = alts[0];
+        alts[0] = alts[1];
+        alts[1] = temp;
+        skiped_y = -1;
+        if(alts[0] == ref) skiped_y = 0;
+        choice_applicable = true;
+        for(int k = 0; k < ref.length(); k++){
+        // for each ref char
+            for(int y = 0; y < 2; y++){
+                // for each strain
+                if(skiped_y == y) continue;
+                if(sp.string_sequences[i*2+y][k+pos] != "."){
+                    // decision in this area has already been made
+                    if(k >= alts[y].length()){
+                        // should be a deletion
+                        choice_applicable = false;
+                        break;
+                    }else{
+                        // should be equal at current position
+                        // can be an insertion, as long as current position is the same
+                        if(ref[k] != alts[y][k]){
+                            choice_applicable = false;
+                            break;
+                        }
+                    }
+                }
+            }
+            if(!choice_applicable) break;
+        }
+        if(choice_applicable){
+            if(var.multi_alts && !var.zero_one_var){
+                //candidate_choices[i].push_back(pair<int, int>(var_index, 1));
+                AppendChangedSp(sp,
+                        variant_list,
+                        sequence_path_list,
+                        reference_sequence,
+                        score_unit,
+                        match_mode,
+                        score_scheme,
+                        var_index,
+                        1);
+            }else{
+                //candidate_choices[i].push_back(pair<int, int>(var_index, -1));
+                AppendChangedSp(sp,
+                        variant_list,
+                        sequence_path_list,
+                        reference_sequence,
+                        score_unit,
+                        match_mode,
+                        score_scheme,
+                        var_index,
+                        -1);
+            }
+        }
+    }
+    if(var.multi_alts && var.zero_one_var){
+        // here contains another two combinations  alt1/ref and ref/alt1
+        // alt1/ref: only the first strand changes, so only it is checked
+        alts[0] = var.alts[1];
+        alts[1] = ref;
+        choice_applicable = true;
+        int y = 0;
+        for(int k = 0; k < ref.length(); k++){
+        // for each ref char
+            {
+                if(sp.string_sequences[i*2+y][k+pos] != "."){
+                    // decision in this area has already been made
+                    if(k >= alts[y].length()){
+                        // should be a deletion
+                        choice_applicable = false;
+                        break;
+                    }else{
+                        // should be equal at current position
+                        // can be an insertion, as long as current position is the same
+                        if(ref[k] != alts[y][k]){
+                            choice_applicable = false;
+                            break;
+                        }
+                    }
+                }
+            }
+            if(!choice_applicable) break;
+        }
+        if(choice_applicable){
+            //candidate_choices[i].push_back(pair<int, int>(var_index, 1));
+            AppendChangedSp(sp,
+                variant_list,
+                sequence_path_list,
+                reference_sequence,
+                score_unit,
+                match_mode,
+                score_scheme,
+                var_index,
+                1);
+        }
+        // ref/alt1: only the second strand changes, so only it is checked
+        alts[0] = ref;
+        alts[1] = var.alts[1];
+        choice_applicable = true;
+        y = 1;
+        for(int k = 0; k < ref.length(); k++){
+        // for each ref char
+            {
+                if(sp.string_sequences[i*2+y][k+pos] != "."){
+                    // decision in this area has already been made
+                    if(k >= alts[y].length()){
+                        // should be a deletion
+                        choice_applicable = false;
+                        break;
+                    }else{
+                        // should be equal at current position
+                        // can be an insertion, as long as current position is the same
+                        if(ref[k] != alts[y][k]){
+                            choice_applicable = false;
+                            break;
+                        }
+                    }
+                }
+            }
+            if(!choice_applicable) break;
+        }
+        if(choice_applicable){
+            //candidate_choices[i].push_back(pair<int, int>(var_index, -2));
+            AppendChangedSp(sp,
+                variant_list,
+                sequence_path_list,
+                reference_sequence,
+                score_unit,
+                match_mode,
+                score_scheme,
+                var_index,
+                -2);
+        }
+    }
+    // every path falls through to here; report success like the sibling functions
+    return true;
+// no genotype means you only need to maintain one strand
+// for simplicity, also work on original SequencePath data structure
+// when making decision, only decide one path
+// when extending, only extend one path
+// when comparing, only compare one path
+//
+// Branch path `sp` on variant `variant_index`, located at
+// sp.current_genome_pos+1, looking only at the first strand. One new path is
+// appended to sequence_path_list for every applicable choice:
+//   NOT_USE : always
+//   0       : var.alts[0]
+//   1       : the other allele, only for variants with multi_alts
+// A choice is applicable when every position it covers is either untouched
+// ("." in string_sequences) or keeps its reference base under the allele.
+//
+// Changes from the original:
+//   * choice 1 is now appended with AppendChangedSpNoGenotype, like the other
+//     two choices, instead of the genotype-aware AppendChangedSp;
+//   * the function is declared bool but had no return statement (undefined
+//     behaviour); it now returns true like its sibling functions.
+bool WholeGenome::VariantMakeDecisionNoGenotype(SequencePath& sp,
+                         vector<DiploidVariant> & variant_list,
+                         list<SequencePath> & sequence_path_list,
+                         const string & reference_sequence,
+                         int score_unit,
+                         int match_mode,
+                         int score_scheme,
+                         int variant_index)
+    int pos = sp.current_genome_pos+1;
+    int var_index = variant_index;
+    DiploidVariant var = variant_list[var_index];
+    // also this variant may not be used
+    AppendChangedSpNoGenotype(sp,
+                variant_list,
+                sequence_path_list,
+                reference_sequence,
+                score_unit,
+                match_mode,
+                score_scheme,
+                var_index,
+                NOT_USE);
+    // row pair in string_sequences: 2/3 when var.flag is set, 0/1 otherwise
+    int i = 0;
+    if(var.flag) i = 1;
+    //PrintVariant(var);
+    // check if current var influence
+    string ref = var.ref; //even we do not know the offset, we know ref start from pos of reference_sequence
+    string alts[2];
+    alts[0] = var.alts[0];
+    alts[1] = alts[0];
+    if(var.multi_alts && !var.zero_one_var){
+        alts[1] = var.alts[1];
+    }else if(var.heterozygous){
+        alts[1] = ref;
+    }
+    bool choice_applicable = true;
+    for(int k = 0; k < ref.length(); k++){
+    // for each ref char
+        // only the first strand is maintained in no-genotype mode
+        int y = 0;
+        {
+            if(sp.string_sequences[i*2+y][k+pos] != "."){
+                // decision in this area has already been made
+                if(k >= alts[y].length()){
+                    choice_applicable = false;
+                    break;
+                }else{
+                    if(ref[k] != alts[y][k]){
+                        choice_applicable = false;
+                        break;
+                    }
+                }
+            }
+        }
+        if(!choice_applicable) break;
+    }
+    if(choice_applicable){
+        //candidate_choices[i].push_back(pair<int, int>(var_index, 0));
+        AppendChangedSpNoGenotype(sp,
+                        variant_list,
+                        sequence_path_list,
+                        reference_sequence,
+                        score_unit,
+                        match_mode,
+                        score_scheme,
+                        var_index,
+                        0);
+    }
+    // if variants is 0/1, then it does not make sense to apply reference, as it is the same as not_use
+    // if variants is 0/1 but contains multi alts, then should try another alt
+    // if variants is 1/2 , then should try another alt
+    // if variants is 1/1 or 2/2 then should not try another alt
+    // but here we do not care the phasing
+    // so as long as variant has multi_alts, use another alt
+    if(var.multi_alts){
+        //if it contains multi alts, then there is another choice, check if it is applicable
+        // NOTE(review): when zero_one_var is set, alts[1] above is ref or
+        // var.alts[0], not var.alts[1], so this check does not examine the
+        // allele that choice 1 applies -- confirm whether that is intended.
+        string temp = alts[0];
+        alts[0] = alts[1];
+        alts[1] = temp;
+        choice_applicable = true;
+        for(int k = 0; k < ref.length(); k++){
+        // for each ref char
+            int y = 0;
+            {
+                if(sp.string_sequences[i*2+y][k+pos] != "."){
+                    // decision in this area has already been made
+                    if(k >= alts[y].length()){
+                        // should be a deletion
+                        choice_applicable = false;
+                        break;
+                    }else{
+                        // should be equal at current position
+                        // can be an insertion, as long as current position is the same
+                        if(ref[k] != alts[y][k]){
+                            choice_applicable = false;
+                            break;
+                        }
+                    }
+                }
+            }
+            if(!choice_applicable) break;
+        }
+        if(choice_applicable){
+            // single-strand variant of the append, consistent with the calls above
+            AppendChangedSpNoGenotype(sp,
+                    variant_list,
+                    sequence_path_list,
+                    reference_sequence,
+                    score_unit,
+                    match_mode,
+                    score_scheme,
+                    var_index,
+                    1);
+        }
+    }
+    // every path falls through to here; report success like the sibling functions
+    return true;
+bool WholeGenome::AppendChangedSpNoGenotype(SequencePath& sp, // Copies sp, applies choice c for one variant to the copy and appends the copy to sequence_path_list; sp itself is not modified.
+                         vector<DiploidVariant> & variant_list,
+                         list<SequencePath> & sequence_path_list, // receives the new path via push_back
+                         const string & reference_sequence, // not referenced in this body
+                         int score_unit, // score_unit, match_mode and score_scheme are only forwarded to CalculateScore
+                         int match_mode,
+                         int score_scheme,
+                         int variant_index, // index into variant_list and into path.choice_vector
+                         int c) // NOT_USE to skip the variant, otherwise 0 or 1 selecting var.alts[c]; every visible return is `true`
+    int pos = sp.current_genome_pos+1; // first position after the path's current genome position
+    SequencePath path = sp; // all edits below go to this copy
+    if(c == NOT_USE){ // variant skipped: only the choice is recorded, sequences and score stay as they are
+        path.choice_vector[variant_index] = c;
+        sequence_path_list.push_back(path);
+        return true;
+    }
+    pair<int, int> var_choice[2]; // NOTE(review): never read in this function
+    int x = 0; // selects the pair of string_sequences rows (x*2, x*2+1) the variant is written to
+    int var_index = variant_index;
+    DiploidVariant var = variant_list[var_index];
+    if(var.flag) x = 1; // presumably flag 0 = baseline/truth and 1 = query/predict, as in ConstructMatchRecord - confirm
+    string ref = var.ref;
+    string alts[2]; // only alts[0] is used here
+    if(c == 0 || c == 1){
+        // c == 0 or 1: take the chosen alternate allele
+        alts[0] = var.alts[c];
+    }else{
+        dout << "Unrecognized choice" << endl; // NOTE(review): only logs; execution continues with alts[0] left empty
+    }
+    path.score += CalculateScore(var,
+                                 c,
+                                 score_unit,
+                                 match_mode,
+                                 score_scheme);
+    ToUpper(ref);
+    ToUpper(alts[0]);
+    int y = 0; // only row x*2+0 of the pair is written in this function
+    string alt = alts[y];
+    vector<string> alt_vector;
+    GenerateAltVector(ref, alt, alt_vector); // presumably yields one alt fragment per ref position; the indexing below relies on alt_vector having ref.length() entries - confirm
+    int k = 0;
+    for(; k < ref.length()-1; k++){ // all ref positions except the last; NOTE(review): ref.length() is unsigned, so an empty ref would make ref.length()-1 wrap around
+        if(alt_vector[k].size() != 1 || ref[k] != alt_vector[k][0]){
+            path.string_sequences[x*2+y][pos+k] = alt_vector[k];
+        }
+        // else the fragment equals the reference base: changes nothing
+    }
+    // hence k == ref.length()-1, the last position, which gets special handling below
+    assert(k == ref.length()-1);
+    string alt_part = alt_vector[k];
+    if(alt_part.length() > 0){
+        if(alt_part.length() > 1){ // more than one base at the last position
+            if(alt_part[0] == ref[k]){
+                if(path.string_sequences[x*2+y][pos+k] == "."){ // "." marks a cell no decision has touched yet
+                    path.string_sequences[x*2+y][pos+k] = alt_part;
+                }else{
+                    path.string_sequences[x*2+y][pos+k] += alt_part.substr(1, alt_part.size() - 1); // cell already filled: append only the bases after the first one
+                }
+            }else{
+                path.string_sequences[x*2+y][pos+k] = alt_part;
+            }
+        }else{
+            if(ref[k] != alt_vector[k][0]){ // single base: written only when it differs from the reference base
+                path.string_sequences[x*2+y][pos+k] = alt_part;
+            }
+        }
+    }else{
+        path.string_sequences[x*2+y][pos+k] = ""; // empty fragment: the cell is set to an empty string
+    }
+    // choice made: remember it for this variant
+    path.choice_vector[variant_index] = c;
+    //dout << "after decision at variant " << variant_index << endl;
+    //PrintPath(path);
+    sequence_path_list.push_back(path);
+    return true;
+// This function is no longer used, because you cannot make the decision for a whole position at once:
+// there might be multiple variants at one position,
+// so a better way is to make the decision for one variant at a time.
+// Previously I just wanted to save some time, but that ignored the multiple-variant condition.
+bool WholeGenome::PathMakeDecision(SequencePath& sp, // For position sp.current_genome_pos+1, enumerates every applicable (variant, choice) pair per set and appends one copy of sp per combination to sequence_path_list.
+                                 vector<DiploidVariant> & variant_list,
+                                 multimap<int, int> * choices_by_pos[], // two maps (index 0 and 1), each from position to variant index
+                                 list<SequencePath> & sequence_path_list, // receives the new paths via push_back
+                                 const string & reference_sequence, // only mentioned in commented-out code below
+                                 int score_unit, // score_unit, match_mode and score_scheme are only forwarded to CalculateScore
+                                 int match_mode,
+                                 int score_scheme)
+    int pos = sp.current_genome_pos+1;
+    vector<pair<int, int>> candidate_choices[2]; // per set: (variant index, choice code); codes used below are 0, 1, -1 and -2
+    for(int i = 0; i < 2; i++){
+        // the (-1,-1) entry makes the second phase do nothing for this set, so it is always safe to add
+        candidate_choices[i].push_back(pair<int, int>(-1, -1));
+        // i.e. at this position the choice of using no variant at all is always offered, whether or not a variant exists
+        pair<multimap<int, int>::iterator, multimap<int, int>::iterator> var_range;
+        var_range = choices_by_pos[i]->equal_range(pos);
+        for(auto it = var_range.first; it != var_range.second; ++it){
+            int var_index = (*it).second;
+            DiploidVariant var = variant_list[var_index];
+            //PrintVariant(var);
+            // check whether the current variant conflicts with decisions already made
+            string ref = var.ref; //even we do not know the offset, we know ref start from pos of reference_sequence
+            string alts[2]; // allele assigned to each of the two strands of set i
+            alts[0] = var.alts[0];
+            alts[1] = alts[0];
+            if(var.multi_alts && !var.zero_one_var){
+                alts[1] = var.alts[1];
+            }else if(var.heterozygous){
+                alts[1] = ref;
+            }
+            // not purely whether a variant can be applied, but whether this particular assignment of alleles to strands can
+            int skiped_y = -1; // strand that carries the reference allele and therefore needs no conflict check
+            if(alts[1] == ref) skiped_y = 1;
+            bool choice_applicable = true;
+            for(int k = 0; k < ref.length(); k++){
+            // for each ref char
+                for(int y = 0; y < 2; y++){
+                    // for each strain
+                    if(y == skiped_y) continue;
+                    if(sp.string_sequences[i*2+y][k+pos] != "."){
+                        // decision in this area has already been made
+                        if(k >= alts[y].length()){ // alt is shorter than ref here, so this cell would be deleted
+                            choice_applicable = false;
+                            break;
+                        }else{
+                            if(ref[k] != alts[y][k]){ // alt would change a base that is already decided
+                                choice_applicable = false;
+                                break;
+                            }
+                        }
+                    }
+                }
+                if(!choice_applicable) break;
+            }
+            if(choice_applicable){
+                candidate_choices[i].push_back(pair<int, int>(var_index, 0)); // code 0: var.alts[0] goes on the first strand
+            }
+            if(var.heterozygous){
+                //if heterozygous, then there is another choice (alleles swapped between strands), check if it is applicable
+                string temp = alts[0];
+                alts[0] = alts[1];
+                alts[1] = temp;
+                skiped_y = -1;
+                if(alts[0] == ref) skiped_y = 0;
+                choice_applicable = true;
+                for(int k = 0; k < ref.length(); k++){
+                // for each ref char
+                    for(int y = 0; y < 2; y++){
+                        // for each strain
+                        if(skiped_y == y) continue;
+                        if(sp.string_sequences[i*2+y][k+pos] != "."){
+                            // decision in this area has already been made
+                            if(k >= alts[y].length()){
+                                // should be a deletion
+                                choice_applicable = false;
+                                break;
+                            }else{
+                                // should be equal at current position
+                                // can be an insertion, as long as current position is the same
+                                if(ref[k] != alts[y][k]){
+                                    choice_applicable = false;
+                                    break;
+                                }
+                            }
+                        }
+                    }
+                    if(!choice_applicable) break;
+                }
+                if(choice_applicable){
+                    if(var.multi_alts && !var.zero_one_var){
+                        candidate_choices[i].push_back(pair<int, int>(var_index, 1)); // code 1: var.alts[1] on the first strand, var.alts[0] on the second
+                    }else{
+                        candidate_choices[i].push_back(pair<int, int>(var_index, -1)); // code -1: ref on the first strand, var.alts[0] on the second
+                    }
+                }
+            }
+            if(var.multi_alts && var.zero_one_var){
+                // here contains another two combinations  alt1/ref and ref/alt1
+                alts[0] = var.alts[1];
+                alts[1] = ref;
+                choice_applicable = true;
+                int y = 0; // only the first strand carries the alt in this combination
+                for(int k = 0; k < ref.length(); k++){
+                // for each ref char
+                    {
+                        // for each strain
+                        if(sp.string_sequences[i*2+y][k+pos] != "."){
+                            // decision in this area has already been made
+                            if(k >= alts[y].length()){
+                                // should be a deletion
+                                choice_applicable = false;
+                                break;
+                            }else{
+                                // should be equal at current position
+                                // can be an insertion, as long as current position is the same
+                                if(ref[k] != alts[y][k]){
+                                    choice_applicable = false;
+                                    break;
+                                }
+                            }
+                        }
+                    }
+                    if(!choice_applicable) break;
+                }
+                if(choice_applicable){
+                    candidate_choices[i].push_back(pair<int, int>(var_index, 1)); // code 1 for a zero_one_var: var.alts[1] on the first strand, ref on the second
+                }
+                alts[0] = ref;
+                alts[1] = var.alts[1];
+                choice_applicable = true;
+                y = 1; // now only the second strand carries the alt
+                for(int k = 0; k < ref.length(); k++){
+                // for each ref char
+                    {
+                        // for each strain
+                        if(sp.string_sequences[i*2+y][k+pos] != "."){
+                            // decision in this area has already been made
+                            if(k >= alts[y].length()){
+                                // should be a deletion
+                                choice_applicable = false;
+                                break;
+                            }else{
+                                // should be equal at current position
+                                // can be an insertion, as long as current position is the same
+                                if(ref[k] != alts[y][k]){
+                                    choice_applicable = false;
+                                    break;
+                                }
+                            }
+                        }
+                    }
+                    if(!choice_applicable) break;
+                }
+                if(choice_applicable){
+                    candidate_choices[i].push_back(pair<int, int>(var_index, -2)); // code -2: ref on the first strand, var.alts[1] on the second
+                }
+            }
+        }
+    }
+    //dout << candidate_choices[0].size() << "," << candidate_choices[1].size() << endl;
+    for(int i = 0; i < candidate_choices[0].size(); i++){
+        for(int j = 0; j < candidate_choices[1].size(); j++){
+            // iterate all combinations of one choice per set
+            SequencePath path = sp; // each combination starts from a fresh copy of sp
+            pair<int, int> var_choice[2];
+            var_choice[0] = candidate_choices[0][i];
+            var_choice[1] = candidate_choices[1][j];
+            for(int x = 0; x < 2; x++){
+                // iterate truth and predict
+                int var_index = var_choice[x].first;
+                if(var_index != -1){ // -1 means "use no variant" for this set: nothing is written
+//                    string temp_sequence = reference_sequence.substr(pos, 1);
+//                    path.string_sequences[x*2][pos] = temp_sequence;
+//                    path.string_sequences[x*2+1][pos] = temp_sequence;
+//                }else{
+                    // set score
+                    DiploidVariant var = variant_list[var_index];
+                    // if(var.flag != x){
+                    //     dout << "Error" << endl;
+                    // }
+                    string ref = var.ref;
+                    string alts[2];
+                    int c = var_choice[x].second; // decode the choice code back into one allele per strand
+                    if(c == -1){
+                        alts[0] = ref;
+                        alts[1] = var.alts[0];
+                    }else if(c == -2){
+                        alts[0] = ref;
+                        alts[1] = var.alts[1];
+                    }else{
+                        // c == 0 or 1
+                        alts[0] = var.alts[c];
+                        alts[1] = alts[0];
+                        if(var.multi_alts && !var.zero_one_var){
+                            // choose 1 or 0: the other strand gets the other alt
+                            alts[1] = var.alts[1- c];
+                        }else{
+                            // c is 0, choose 0 or -1
+                            if(var.heterozygous) alts[1] = ref;
+                        }
+                    }
+                    path.score += CalculateScore(var,
+                                                 c,
+                                                 score_unit,
+                                                 match_mode,
+                                                 score_scheme);
+                    ToUpper(ref);
+                    ToUpper(alts[0]);
+                    ToUpper(alts[1]);
+                    for(int y = 0; y < 2; y++){
+                        // iterate two alts
+                        string alt = alts[y];
+                        vector<string> alt_vector;
+                        GenerateAltVector(ref, alt, alt_vector); // presumably yields one alt fragment per ref position; the indexing below relies on that - confirm
+                        int k = 0;
+                        for(; k < ref.length()-1; k++){ // NOTE(review): ref.length() is unsigned, so an empty ref would make ref.length()-1 wrap around
+                            if(alt_vector[k].size() != 1 || ref[k] != alt_vector[k][0]){
+                                path.string_sequences[x*2+y][pos+k] = alt_vector[k];
+                            }
+                            // else changes nothing
+                        }
+                        // hence k == ref.length()-1, the last position
+                        assert(k == ref.length()-1);
+                        string alt_part = alt_vector[k];
+                        if(alt_part.length() > 0){
+                            if(alt_part.length() > 1){
+                                if(alt_part[0] == ref[k]){
+                                    if(path.string_sequences[x*2+y][pos+k] == "."){
+                                        path.string_sequences[x*2+y][pos+k] = alt_part;
+                                    }else{
+                                        path.string_sequences[x*2+y][pos+k] += alt_part.substr(1, alt_part.size() - 1); // cell already filled: append only the bases after the first one
+                                    }
+                                }else{
+                                    path.string_sequences[x*2+y][pos+k] = alt_part;
+                                }
+                            }else{
+                                if(ref[k] != alt_vector[k][0]){
+                                    path.string_sequences[x*2+y][pos+k] = alt_part;
+                                }
+                            }
+                        }else{
+                            path.string_sequences[x*2+y][pos+k] = ""; // empty fragment: the cell is set to an empty string
+                        }
+                    }
+                }
+                path.choice_made[x][pos] = var_choice[x]; // recorded even for the (-1,-1) "no variant" choice
+            }
+            // choice made
+            dout << "after decision at pos " << pos << endl;
+            PrintPath(path);
+            sequence_path_list.push_back(path);
+        }
+    }
+    //expected number of inserted paths are 2,3,4,6,x...
+    return true;
+bool WholeGenome::PathMakeDecisionBackup(SequencePath& sp, // Stub: the visible body reads none of its parameters and only returns true.
+                                 vector<DiploidVariant> & variant_list,
+                                 multimap<int, int> * choices_by_pos[],
+                                 list<SequencePath> & sequence_path_list, // nothing is appended here
+                                 const string & reference_sequence,
+                                 int score_unit,
+                                 int match_mode,
+                                 int score_scheme)
+    //expected number of inserted paths are 2,3,4,6,x... (leftover note; this body inserts none)
+    return true;
+void WholeGenome::PrintPath(SequencePath & sp){ // Debug helper: dumps the contents of sp to cout; sp is only read.
+    cout << "- Sequence Path:" << endl;
+    cout << "@ String Sequences:" << endl;
+    for(int i = 0; i < 4; i++){ // four rows, one output line each
+        for(int j = 0; j < sp.string_sequences[i].size(); j++){
+            cout << sp.string_sequences[i][j] << " "; // cells are space-separated
+        }
+        cout << endl;
+    }
+    cout << "@ Donor Sequences:" << endl;
+    for(int i = 0; i < 4; i++){ // four donor sequences, one per line
+        cout << sp.donor_sequences[i] << endl;
+    }
+    cout << "@ Removable: " << sp.removable << endl;
+    for(int i = 0; i < sp.choice_vector.size(); i++){ // one choice per variant, comma-terminated
+        cout << sp.choice_vector[i] << ",";
+    }
+    cout << endl;
+// Searches one cluster for the best-scoring path: loop until the current path list is empty;
+// a path that extends to the next sync point is added to the next path list;
+// a path that needs a decision is branched, and the branches are appended to the current list;
+// a path that reaches the end is compared with the best path. Returns false when no path scored above 0, true otherwise.
+bool WholeGenome::MatchingSingleClusterBaseExtending(int cluster_index, // only mentioned in commented-out debug output
+                                                    int thread_index, // thread_index, offset, chr_id and threshold_index are only forwarded to ConstructMatchRecord*
+                                                    vector<DiploidVariant> & variant_list,
+                                                    string & subsequence, // reference subsequence of this cluster; its length sizes the initial path
+                                                    int offset,
+                                                    multimap<int, int> * choices_by_pos[],
+                                                    vector<int> & sync_points, // only forwarded to PathExtendOneStep
+                                                    int chr_id,
+                                                    int score_unit,
+                                                    int match_mode, // 0 selects the genotype-aware helpers, any other value the NoGenotype ones
+                                                    int score_scheme,
+                                                    int threshold_index){
+    //--------------for unit test------------------------------
+    //dout << variant_list.size() << endl;
+    //int chr_id = 0;
+    //-------------end unit test-------------------------------
+    //cout << chr_id << "," << cluster_index << "," << variant_list.size() << endl;
+    // so a legal sync_points vector contains at least two
+    // first is the end of variant, there should be at least one variant
+    // second is the end of subsequence, there should be at least one nt not influenced by a variant
+    list<SequencePath> current_path_list; // paths still being extended towards the next sync point
+    list<SequencePath> next_path_list; // paths that already reached the next sync point
+    SequencePath sp(subsequence.length(), variant_list.size());
+    SequencePath best_path = sp; // starts as the untouched initial path
+    current_path_list.push_back(sp);
+    while(current_path_list.size() != 0){
+        bool reach_sync_point = true; // NOTE(review): never read in this function
+        // extend path before reaches sync points
+        //cout << "\t" << current_path_list.size() << endl;
+        while(current_path_list.size() != 0){
+            SequencePath path = current_path_list.front(); // take a copy, then remove the original from the list
+            current_path_list.pop_front();
+            //dout << path.current_genome_pos << ":" << current_path_list.size() << endl;
+            //PrintPath(path);
+            int variant_need_decision = -1; // presumably set by PathExtendOneStep when it returns 1 - confirm
+            int is_extend = PathExtendOneStep(path, choices_by_pos, subsequence, sync_points, match_mode, variant_need_decision);
+            //cout << variant_need_decision << endl;
+            //PrintPath(path);
+            if(is_extend == -1){
+                // discard path
+                continue;
+            }
+            else if(is_extend == 0){
+                next_path_list.push_back(path);
+                // here the path is supposed to reach the next sync point
+                // so it goes into next path list, and decrease the number of current path list
+            }else if(is_extend == 1){ // a variant needs a decision: the helper appends the branches to current_path_list
+                if(match_mode == 0){
+                    // PathMakeDecision(path,
+                    //                  variant_list,
+                    //                  choices_by_pos,
+                    //                  current_path_list,
+                    //                  subsequence,
+                    //                  score_unit,
+                    //                  match_mode,
+                    //                  score_scheme);
+                    VariantMakeDecision(path,
+                                         variant_list,
+                                         current_path_list,
+                                         subsequence,
+                                         score_unit,
+                                         match_mode,
+                                         score_scheme,
+                                         variant_need_decision);
+                }else{
+                    // PathMakeDecisionNoGenotype(path,
+                    //                            variant_list,
+                    //                            choices_by_pos,
+                    //                            current_path_list,
+                    //                            subsequence,
+                    //                            score_unit,
+                    //                            match_mode,
+                    //                            score_scheme);
+                    VariantMakeDecisionNoGenotype(path,
+                                                 variant_list,
+                                                 current_path_list,
+                                                 subsequence,
+                                                 score_unit,
+                                                 match_mode,
+                                                 score_scheme,
+                                                 variant_need_decision);
+                }
+            }else if(is_extend == 2){ // the path reached the very end
+                if(path.score > best_path.score){ // strictly greater: on a tie the earlier path is kept
+                    best_path = path; // only when you reach the very end can you be considered as best path
+                    //PrintPath(best_path);
+                }
+            }
+        }
+        current_path_list = next_path_list; // copies the list; next_path_list is emptied right after
+        next_path_list.clear();
+        if(current_path_list.size() > 0){
+            //int current_genome_pos = current_path_list.front().current_genome_pos;
+            // after revise, we do not need this check
+            //if(sync_points.find(current_genome_pos) != sync_points.end()){
+                //dout << "converge paths at position: " << current_genome_pos << endl;
+                //dout << "before converge: " << current_path_list.size() << endl;
+                ConvergePaths(current_path_list); // presumably merges equivalent paths at the sync point - confirm
+                //dout << "after converge: " << current_path_list.size() << endl;
+            //}
+        }
+    }
+    current_path_list.clear();
+    next_path_list.clear();
+    // print best_path
+    if(best_path.score <= 0) return false; // no path ended with a positive score: nothing is recorded
+    //dout << "new method: " << best_path.score << endl;
+    //PrintPath(best_path);
+    //==========================output ======================
+    int mode_index = GetIndexFromMatchScore(score_unit, match_mode, score_scheme);
+    //return true;
+    if(match_mode == 0){
+        ConstructMatchRecord(best_path,
+                             variant_list,
+                             subsequence,
+                             offset,
+                             thread_index,
+                             chr_id,
+                             mode_index,
+                             threshold_index);
+    }else{
+        ConstructMatchRecordNoGenotype(best_path,
+                                       variant_list,
+                                       subsequence,
+                                       offset,
+                                       thread_index,
+                                       chr_id,
+                                       mode_index,
+                                       threshold_index);
+    }
+    return true;
+int GetMatchmodeFromModeIndex(int mode_index){ // Returns bit 2 of mode_index, i.e. 0 or 1; presumably undoes the packing done by GetIndexFromMatchScore - confirm
+    int result = mode_index;
+    result >>= 2; // drop the two lowest bits
+    result &= 1; // keep only the bit that is now lowest
+    return result;
+void WholeGenome::ConstructMatchRecord(SequencePath & best_path, // Adds the variants used by best_path to the per-thread counters and, when threshold_index == 0, also stores a tab-separated text record of the match.
+                                       vector<DiploidVariant> & variant_list,
+                                       string & subsequence, // written to the record as the reference column
+                                       int offset, // 0-based start; written to the record as offset+1
+                                       int thread_index, // selects the per-thread counters and record list
+                                       int chr_id, // index into chrname_by_chrid
+                                       int mode_index,
+                                       int threshold_index){ // a text record is built only when this is 0
+    int truth_num = 0;
+    int predict_num = 0;
+    int truth_edit_distance = 0;
+    int predict_edit_distance = 0;
+    bool need_match_record = false;
+    if (threshold_index == 0) need_match_record = true;
+    bool multiple_match = false; // true when the first two donor sequences differ
+    if(best_path.donor_sequences[0] != best_path.donor_sequences[1]) multiple_match = true;
+    string parsimonious_ref = subsequence;
+    string parsimonious_alt0 = best_path.donor_sequences[0];
+    string parsimonious_alt1 = best_path.donor_sequences[1];
+    int parsimonious_pos = offset;
+//    NormalizeVariantSequence(offset,
+//                             parsimonious_ref,
+//                             parsimonious_alt0,
+//                             parsimonious_alt1,
+//                             chr_id);
+    string match_record = chrname_by_chrid[chr_id] + "\t" + to_string(parsimonious_pos+1) + "\t" + parsimonious_ref + "\t" + parsimonious_alt0;
+    if(multiple_match) match_record += "/" + parsimonious_alt1; // second donor sequence is appended only when it differs
+    string vcf_record[2]; // per set: "pos,ref,alt" entries joined by ';'
+    string phasing_record[2]; // per set: phasing strings joined by ';'
+    for (int i = 0; i < 2; i++) { // i == 0 is counted as truth, i == 1 as predict
+        for (int var_index = 0; var_index < variant_list.size(); var_index++) {
+            DiploidVariant variant = variant_list[var_index];
+            if(variant.flag != i) continue; // only variants belonging to set i
+            //The exact wording from the C++ standard is (§4.7/4): "If the source type is bool,
+            // the value false is converted to zero and the value true is converted to one."
+            int phasing = best_path.choice_vector[var_index];
+            if(phasing <= NOT_USE) continue; // variant not used by the best path; presumably NOT_USE is below -2, since -1 and -2 are handled below - confirm
+            int edit_distance = CalculateEditDistance(variant, phasing, 0); // third argument is 1 in the NoGenotype variant; presumably the match mode - confirm
+            if(i == 0){
+                truth_num ++;
+                truth_edit_distance += edit_distance;
+            }else{
+                predict_num ++;
+                predict_edit_distance += edit_distance;
+            }
+            if(need_match_record){
+                string alt_string = variant.alts[0];
+                if(variant.multi_alts){
+                    alt_string += "/" + variant.alts[1];
+                }
+                string phasing_string = ""; // "a|b" with 0 = ref, 1 = alts[0], 2 = alts[1]
+                if(phasing == 0){
+                    phasing_string += "1";
+                    if(variant.heterozygous){
+                        if(variant.multi_alts && !variant.zero_one_var){
+                            phasing_string += "|2";
+                        }else{
+                            phasing_string += "|0";
+                        }
+                    }else{
+                        phasing_string += "|1";
+                    }
+                }else if(phasing == 1){
+                    if(variant.multi_alts && !variant.zero_one_var){
+                        phasing_string += "2|1";
+                    }else if(variant.multi_alts && variant.zero_one_var){
+                        phasing_string += "2|0";
+                    }else{
+                        phasing_string += "0|1";
+                    }
+                }else if(phasing == -1){
+                    phasing_string += "0|1";
+                }else if(phasing == -2){
+                    phasing_string += "0|2";
+                }
+                string variant_record = to_string(variant.pos+1) + "," + variant.ref + "," + alt_string;
+                vcf_record[i] += variant_record;
+                phasing_record[i] += phasing_string;
+                vcf_record[i] += ";";
+                phasing_record[i] += ";";
+            }
+        }
+        if(need_match_record){ // drop the trailing ';'; for an empty string size()-1 wraps around and substr still yields an empty string
+            vcf_record[i] = vcf_record[i].substr(0, vcf_record[i].size()-1);
+            phasing_record[i] = phasing_record[i].substr(0, phasing_record[i].size()-1);
+        }
+    }
+    if(need_match_record){
+        match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+        match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+        match_record += "\t" + to_string(best_path.score) + "\n";
+        //complex_match_records[thread_index]->push_back(match_record);
+        // this line should be recovered
+        match_records_by_mode_by_thread[thread_index][mode_index]->push_back(match_record);
+    }
+    baseline_total_match_num[thread_index][threshold_index]->at(mode_index) += truth_num; // counters are updated whether or not a text record was built
+    query_total_match_num[thread_index][threshold_index]->at(mode_index) += predict_num;
+    baseline_total_edit_distance[thread_index][threshold_index]->at(mode_index) += truth_edit_distance;
+    query_total_edit_distance[thread_index][threshold_index]->at(mode_index) += predict_edit_distance;
+void WholeGenome::ConstructMatchRecordNoGenotype(SequencePath & best_path, // Same bookkeeping as ConstructMatchRecord, but with one donor sequence in the record and phasing written as "1|1" or "2|2".
+                                                 vector<DiploidVariant> & variant_list,
+                                                 string & subsequence, // written to the record as the reference column
+                                                 int offset, // 0-based start; written to the record as offset+1
+                                                 int thread_index, // selects the per-thread counters and record list
+                                                 int chr_id, // index into chrname_by_chrid
+                                                 int mode_index,
+                                                 int threshold_index){ // a text record is built only when this is 0
+    int truth_num = 0;
+    int predict_num = 0;
+    int truth_edit_distance = 0;
+    int predict_edit_distance = 0;
+    bool need_match_record = false;
+    if(threshold_index == 0) need_match_record = true;
+    bool multiple_match = false; // NOTE(review): never read here; its only use is commented out below
+    string parsimonious_ref = subsequence;
+    string parsimonious_alt0 = best_path.donor_sequences[0];
+    string parsimonious_alt1 = best_path.donor_sequences[0]; // NOTE(review): copy of donor_sequences[0], referenced only from commented-out code
+    int parsimonious_pos = offset;
+//    NormalizeVariantSequence(offset,
+//                             parsimonious_ref,
+//                             parsimonious_alt0,
+//                             parsimonious_alt1,
+//                             chr_id);
+    string match_record = chrname_by_chrid[chr_id] + "\t" + to_string(parsimonious_pos+1) + "\t" + parsimonious_ref + "\t" + parsimonious_alt0;
+    //if(multiple_match) match_record += "/" + parsimonious_alt1;
+    string vcf_record[2]; // per set: "pos,ref,alt" entries joined by ';'
+    string phasing_record[2]; // per set: phasing strings joined by ';'
+    for (int i = 0; i < 2; i++) { // i == 0 is counted as truth, i == 1 as predict
+        for (int var_index = 0; var_index < variant_list.size(); var_index++) {
+            DiploidVariant variant = variant_list[var_index];
+            if(variant.flag != i) continue; // only variants belonging to set i
+            //The exact wording from the C++ standard is (§4.7/4): "If the source type is bool,
+            // the value false is converted to zero and the value true is converted to one."
+            int phasing = best_path.choice_vector[var_index];
+            if(phasing <= NOT_USE) continue; // variant not used by the best path
+            int edit_distance = CalculateEditDistance(variant, phasing, 1); // third argument is 0 in ConstructMatchRecord; presumably the match mode - confirm
+            if(i == 0){
+                truth_num ++;
+                truth_edit_distance += edit_distance;
+            }else{
+                predict_num ++;
+                predict_edit_distance += edit_distance;
+            }
+            if(need_match_record){
+                string alt_string = variant.alts[0];
+                if(variant.multi_alts){
+                    alt_string += "/" + variant.alts[1];
+                }
+                string phasing_string = ""; // stays empty for any choice other than 0 or 1
+                if(phasing == 0){
+                    phasing_string += "1|1";
+                }else if(phasing == 1){
+                    phasing_string += "2|2";
+                }
+                string variant_record = to_string(variant.pos+1) + "," + variant.ref + "," + alt_string;
+                vcf_record[i] += variant_record;
+                phasing_record[i] += phasing_string;
+                vcf_record[i] += ";";
+                phasing_record[i] += ";";
+            }
+        }
+        if(need_match_record){ // drop the trailing ';'; for an empty string size()-1 wraps around and substr still yields an empty string
+            vcf_record[i] = vcf_record[i].substr(0, vcf_record[i].size()-1);
+            phasing_record[i] = phasing_record[i].substr(0, phasing_record[i].size()-1);
+        }
+    }
+    if(need_match_record){
+       match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+        match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+       match_record += "\t" + to_string(best_path.score) + "\n";
+       //complex_match_records[thread_index]->push_back(match_record);
+       // this line should be recovered
+       match_records_by_mode_by_thread[thread_index][mode_index]->push_back(match_record);
+    }
+    baseline_total_match_num[thread_index][threshold_index]->at(mode_index) += truth_num; // counters are updated whether or not a text record was built
+    query_total_match_num[thread_index][threshold_index]->at(mode_index) += predict_num;
+    baseline_total_edit_distance[thread_index][threshold_index]->at(mode_index) += truth_edit_distance;
+    query_total_edit_distance[thread_index][threshold_index]->at(mode_index) += predict_edit_distance;
+// Backup of the old genotype-aware record builder (no longer used): counts the variants chosen on best_path for baseline and query and, when threshold_index == 0, appends one tab-separated match record to match_records_by_mode_by_thread[thread_index][mode_index].
+void WholeGenome::ConstructMatchRecordBackup(SequencePath & best_path,
+                                       vector<DiploidVariant> & variant_list,
+                                       string & subsequence,
+                                       int offset,
+                                       int thread_index,
+                                       int chr_id,
+                                       int mode_index,
+                                       int threshold_index){
+    int truth_num = 0; // chosen variants whose flag is false; added to baseline_total_match_num below
+    int predict_num = 0; // chosen variants whose flag is true; added to query_total_match_num below
+    bool need_match_record = false;
+    if (threshold_index == 0) need_match_record = true; // the textual record is only built for the first threshold
+    bool multiple_match = false;
+    if(best_path.donor_sequences[0] != best_path.donor_sequences[1]) multiple_match = true; // the two donor sequences differ, so ALT lists both
+    string parsimonious_ref = subsequence;
+    string parsimonious_alt0 = best_path.donor_sequences[0];
+    string parsimonious_alt1 = best_path.donor_sequences[1];
+    int parsimonious_pos = offset;
+//    NormalizeVariantSequence(offset,
+//                             parsimonious_ref,
+//                             parsimonious_alt0,
+//                             parsimonious_alt1,
+//                             chr_id);
+    string match_record = chrname_by_chrid[chr_id] + "\t" + to_string(parsimonious_pos+1) + "\t" + parsimonious_ref + "\t" + parsimonious_alt0; // CHROM, offset + 1, REF, first ALT
+    if(multiple_match) match_record += "/" + parsimonious_alt1;
+    string vcf_record[2];
+    string phasing_record[2];
+	for (int i = 0; i < 2; i++) { // walk both choice maps stored on the path
+		for (auto it = best_path.choice_made[i].begin(); it != best_path.choice_made[i].end(); ++it) {
+            pair<int, int> selection = it->second; // (variant index into variant_list, phasing index)
+            int phasing = selection.second;
+            if(selection.first == -1) continue; // presumably -1 marks "no variant chosen" — TODO confirm
+            DiploidVariant variant = variant_list[selection.first];
+            if(!variant.flag){
+                truth_num++;
+            }else{
+                predict_num++;
+            }
+            if(need_match_record){
+                string alt_string = variant.alts[0];
+                if(variant.multi_alts){
+                    alt_string += "/" + variant.alts[1];
+                }
+                string phasing_string = ""; // genotype text derived from the phasing index; stays empty for values not handled below
+                if(phasing == 0){
+                    phasing_string += "1";
+                    if(variant.heterozygous){
+                        if(variant.multi_alts && !variant.zero_one_var){
+                            phasing_string += "|2";
+                        }else{
+                            phasing_string += "|0";
+                        }
+                    }else{
+                        phasing_string += "|1";
+                    }
+                }else if(phasing == 1){
+                    if(variant.multi_alts && !variant.zero_one_var){
+                        phasing_string += "2|1";
+                    }else if(variant.multi_alts && variant.zero_one_var){
+                        phasing_string += "2|0";
+                    }else{
+                        phasing_string += "0|1";
+                    }
+                }else if(phasing == -1){
+                    phasing_string += "0|1";
+                }else if(phasing == -2){
+                    phasing_string += "0|2";
+                }
+                string variant_record = to_string(variant.pos+1) + "," + variant.ref + "," + alt_string; // "pos+1,ref,alts"
+                vcf_record[i] += variant_record;
+                phasing_record[i] += phasing_string;
+                vcf_record[i] += ";";
+                phasing_record[i] += ";";
+            }
+		}
+        if(need_match_record){
+            vcf_record[i] = vcf_record[i].substr(0, vcf_record[i].size()-1); // drop the trailing ';' (an empty string stays empty)
+            phasing_record[i] = phasing_record[i].substr(0, phasing_record[i].size()-1);
+        }
+	}
+    if(need_match_record){
+    	match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+        match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+    	match_record += "\t" + to_string(best_path.score) + "\n";
+    	//complex_match_records[thread_index]->push_back(match_record);
+        // this line should be recovered
+        match_records_by_mode_by_thread[thread_index][mode_index]->push_back(match_record);
+    }
+    baseline_total_match_num[thread_index][threshold_index]->at(mode_index) += truth_num; // counters are updated for every threshold, not only threshold 0
+    query_total_match_num[thread_index][threshold_index]->at(mode_index) += predict_num;
+// Backup record builder for matching without genotype.
+// Counts the variants selected on best_path (flag false -> baseline, flag true -> query),
+// skipping entries whose variant index or phasing index is -1, and adds the counts to the
+// per-thread totals for (threshold_index, mode_index).
+// When threshold_index == 0 it also appends one tab-separated line
+// (CHROM, offset+1, REF, ALT, VCF1, VCF2, PHASE1, PHASE2, SCORE) to
+// match_records_by_mode_by_thread[thread_index][mode_index].
+void WholeGenome::ConstructMatchRecordNoGenotypeBackup(SequencePath & best_path,
+                                                 vector<DiploidVariant> & variant_list,
+                                                 string & subsequence,
+                                                 int offset,
+                                                 int thread_index,
+                                                 int chr_id,
+                                                 int mode_index,
+                                                 int threshold_index){
+    const bool need_match_record = (threshold_index == 0);
+    int baseline_count = 0;
+    int query_count = 0;
+    // Only the first donor sequence is reported as ALT in this mode.
+    string match_record = chrname_by_chrid[chr_id];
+    match_record += "\t" + to_string(offset+1);
+    match_record += "\t" + subsequence;
+    match_record += "\t" + best_path.donor_sequences[0];
+    string vcf_record[2];
+    string phasing_record[2];
+    for (int side = 0; side < 2; ++side) {
+        for (auto & entry : best_path.choice_made[side]) {
+            const int variant_id = entry.second.first;
+            const int phasing = entry.second.second;
+            if (variant_id == -1 || phasing == -1) continue;
+            DiploidVariant variant = variant_list[variant_id];
+            if (variant.flag) {
+                ++query_count;
+            } else {
+                ++baseline_count;
+            }
+            if (!need_match_record) continue;
+            string alt_string = variant.alts[0];
+            if (variant.multi_alts) alt_string += "/" + variant.alts[1];
+            // Phasing index 0 / 1 is written as a homozygous call of the first / second ALT;
+            // any other value leaves the phasing text empty.
+            string phasing_string;
+            switch (phasing) {
+                case 0: phasing_string = "1|1"; break;
+                case 1: phasing_string = "2|2"; break;
+                default: break;
+            }
+            vcf_record[side] += to_string(variant.pos+1) + "," + variant.ref + "," + alt_string + ";";
+            phasing_record[side] += phasing_string + ";";
+        }
+        if (need_match_record) {
+            // Strip the trailing ';' separator, if anything was written.
+            if (!vcf_record[side].empty()) vcf_record[side].pop_back();
+            if (!phasing_record[side].empty()) phasing_record[side].pop_back();
+        }
+    }
+    if (need_match_record) {
+        match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+        match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+        match_record += "\t" + to_string(best_path.score) + "\n";
+        match_records_by_mode_by_thread[thread_index][mode_index]->push_back(match_record);
+    }
+    baseline_total_match_num[thread_index][threshold_index]->at(mode_index) += baseline_count;
+    query_total_match_num[thread_index][threshold_index]->at(mode_index) += query_count;
+// Returns true when paths a and b agree on donor-sequence lengths for both haplotype pairs:
+// sequences 0/1 (the "truth" pair) and sequences 2/3 (the "query" pair). Within each pair the
+// lengths may match either in the same order or swapped.
+bool WholeGenome::DonorLengthEqual(SequencePath & a, SequencePath & b){
+    const size_t a0 = a.donor_sequences[0].length();
+    const size_t a1 = a.donor_sequences[1].length();
+    const size_t b0 = b.donor_sequences[0].length();
+    const size_t b1 = b.donor_sequences[1].length();
+    const bool truth_same = (a0 == b0 && a1 == b1) || (a0 == b1 && a1 == b0);
+    if(!truth_same) return false;
+    const size_t a2 = a.donor_sequences[2].length();
+    const size_t a3 = a.donor_sequences[3].length();
+    const size_t b2 = b.donor_sequences[2].length();
+    const size_t b3 = b.donor_sequences[3].length();
+    return (a2 == b2 && a3 == b3) || (a2 == b3 && a3 == b2);
+// Predicate passed to list::remove_if by ConvergePaths: true for paths flagged as removable.
+bool IsRemovable(SequencePath & s){
+    return s.removable;
+}
+// Prune redundant paths from path_list.
+// Two paths are compared only when neither is already flagged removable and both have
+// same_donor_len set. If DonorLengthEqual() holds for such a pair, the lower-scoring path is
+// flagged (on a tie the later one in the list is flagged). All flagged paths are erased at the end.
+// Lists with at most one path are left untouched.
+void WholeGenome::ConvergePaths(list<SequencePath> & path_list){
+    if(path_list.size() <= 1) return;
+    for(list<SequencePath>::iterator i = path_list.begin(); i != path_list.end(); ++i){
+        // Bind by reference: copying a SequencePath duplicates its string vectors and maps,
+        // which is wasted work inside this pairwise loop.
+        SequencePath & ref_path = *i;
+        if(ref_path.removable || !ref_path.same_donor_len) continue;
+        list<SequencePath>::iterator j = i;
+        for(++j; j != path_list.end(); ++j){
+            SequencePath & que_path = *j;
+            if(que_path.removable || !que_path.same_donor_len) continue;
+            if(!DonorLengthEqual(ref_path, que_path)) continue;
+            if(ref_path.score >= que_path.score){
+                que_path.removable = true;
+            }else{
+                // The outer path lost; no need to compare it against the remaining paths.
+                ref_path.removable = true;
+                break;
+            }
+        }
+    }
+    path_list.remove_if(IsRemovable);
+// Ad-hoc test scaffold: installs a 9-base toy sequence as genome_sequences[0] and builds a list
+// of eight sample variants. The matching call that used the list is disabled, so the function
+// currently has no further effect and always returns 0.
+int WholeGenome::test() {
+    genome_sequences[0] = "GTCAGCCGG";
+    vector<DiploidVariant> var_list;
+    // First five entries end with last argument 0, the remaining three with 1.
+    var_list.push_back(DiploidVariant(1, "T", vector<string>{"A", "C"}, true, true, 0,0,0));
+    var_list.push_back(DiploidVariant(4, "G", vector<string>{"C", ""}, true, false, 0,0,0));
+    var_list.push_back(DiploidVariant(5, "C", vector<string>{"T", ""}, true, false, 0,0,0)); // this is false negative
+    var_list.push_back(DiploidVariant(6, "C", vector<string>{"G", ""}, true, false, 0,0,0));
+    var_list.push_back(DiploidVariant(7, "G", vector<string>{"A", ""}, true, false, 0,0,0));
+    var_list.push_back(DiploidVariant(1, "T", vector<string>{"A", "C"}, true, true, 0,0,1));
+    var_list.push_back(DiploidVariant(3, "AG", vector<string>{"A", ""}, true, false, 1,0,1));
+    var_list.push_back(DiploidVariant(7, "G", vector<string>{"GA", ""}, true, false, 0,1,1));
+    // Disabled driver, kept for reference:
+    //cout << MatchingSingleClusterBaseExtending(var_list, 0) << endl;
+    return 0;
+// private: splits variants_by_cluster into thread_num contiguous ranges, matches them on thread_num - 1 spawned threads plus the calling thread, then writes <output_dir>/<output_prefix>.stat and one .match file per (score_unit, match_mode, score_scheme), and finally frees all per-thread buffers.
+void WholeGenome::ClusteringMatchMultiThread() {
+	int start = 0;
+	int cluster_number = variants_by_cluster.size(); // cluster number
+	int cluster_end_boundary = start + cluster_number; // end cluster id, exclusive
+	int cluster_step = cluster_number / thread_num; // assign clusters to threads
+	if (cluster_step * thread_num < cluster_number) cluster_step++; // round up so that thread_num ranges cover every cluster
+	int end = start + cluster_step;
+	//initialize vector size
+	//complex_match_records = new vector<string>*[thread_num];
+	match_records_by_mode_by_thread = new vector<string>**[thread_num]; // indexed [thread][mode] -> vector of record lines
+    //query_matches_by_mode_by_thread = new vector<int> ** [thread_num];
+	for(int i = 0; i < thread_num; i++){
+        match_records_by_mode_by_thread[i] = new vector<string>*[MATCH_MODE_NUM];
+        for(int j = 0; j < MATCH_MODE_NUM; j++){
+            match_records_by_mode_by_thread[i][j] = new vector<string>;
+        }
+	}
+    baseline_total_match_num = new vector<int>** [thread_num]; // the four counters below are indexed [thread][threshold] -> vector with one slot per mode
+    query_total_match_num = new vector<int> ** [thread_num];
+    baseline_total_edit_distance = new vector<int>** [thread_num];
+    query_total_edit_distance = new vector<int>** [thread_num];
+    for(int i = 0; i < thread_num; i++){
+        baseline_total_match_num[i] = new vector<int>* [ROC_SAMPLE_NUM];
+        query_total_match_num[i] = new vector<int>* [ROC_SAMPLE_NUM];
+        baseline_total_edit_distance[i] = new vector<int> * [ROC_SAMPLE_NUM];
+        query_total_edit_distance[i] = new vector<int>* [ROC_SAMPLE_NUM];
+        for(int j = 0; j < ROC_SAMPLE_NUM; j++){
+            baseline_total_match_num[i][j] = new vector<int>;
+            baseline_total_match_num[i][j]->resize(MATCH_MODE_NUM, 0);
+            query_total_match_num[i][j] = new vector<int>;
+            query_total_match_num[i][j]->resize(MATCH_MODE_NUM, 0);
+            baseline_total_edit_distance[i][j] = new vector<int>;
+            baseline_total_edit_distance[i][j]->resize(MATCH_MODE_NUM, 0);
+            query_total_edit_distance[i][j] = new vector<int>;
+            query_total_edit_distance[i][j]->resize(MATCH_MODE_NUM, 0);
+        }
+    }
+	vector<thread> threads;
+	//spawn threads
+	unsigned i = 0;
+	for (; i < thread_num - 1; i++) { // NOTE(review): unsigned i vs int thread_num; thread_num == 0 would wrap here (and divide by zero above) — confirm callers guarantee thread_num >= 1
+		threads.push_back(thread(&WholeGenome::ClusteringMatchInThread, this, start, end, i));
+		start = end;
+		end = start + cluster_step;
+	}
+	// The calling thread processes the last range itself.
+	// i equals to (thread_num - 1)
+	if (i != thread_num - 1) {
+		dout << "[Error] thread number not match" << endl;
+	}
+	if (start >= variants_by_cluster.size()) { // nothing is left for the calling thread; only a message is logged
+		dout << "[Error] index out of map range" << endl;
+	}
+	else {
+		ClusteringMatchInThread(start, end, i);
+	}
+	// wait for every spawned worker before reading the per-thread results
+    std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
+    //output all results
+    cout << "writing results..." << endl;
+    ofstream output_stat_file;
+    output_stat_file.open(output_dir + "/" + output_prefix+".stat");
+    cout << "=========VarMatch Result Stat.=======" << endl;
+    string stat_head_string = "#score_unit\tmatch_mode\tscore_unit\tqual_threshold\tbaseline_match_num\tquery_match_num\tquery_total_num\tbaseline_total_ED\tquery_total_ED"; // NOTE(review): the third column is labelled score_unit but the rows print score_scheme there
+    cout << stat_head_string << endl;
+    output_stat_file << "##Baseline:" << baseline_variant_total_num << endl;
+    output_stat_file << "##Query:"<< query_variant_total_num << endl;
+    output_stat_file << stat_head_string << endl;
+    int score_unit;
+    int match_mode;
+    int score_scheme;
+    for(int x = 0; x < score_unit_list.size(); x++){ // one stat row per requested (score_unit, match_mode, score_scheme) combination
+        score_unit = score_unit_list[x];
+        for(int y = 0; y < match_mode_list.size(); y++){
+            match_mode = match_mode_list[y];
+            for(int z = 0; z < score_scheme_list.size(); z++){
+                score_scheme = score_scheme_list[z];
+                int mode_index = GetIndexFromMatchScore(score_unit, match_mode, score_scheme);
+                int total_ref_complex = 0; // not used below
+                int total_que_complex = 0; // not used below
+                string threshold_string = ""; // each *_string below is a comma-separated list with one entry per threshold
+                string baseline_match_num_string = "";
+                string query_match_num_string = "";
+                string query_total_num_string = "";
+                string baseline_edit_distance_string = "";
+                string query_edit_distance_string = "";
+                for(int t = 0; t < threshold_num; t++){
+                    threshold_string += to_string(threshold_list[t]);
+                    int baseline_match_num_by_threshold_by_mode = 0;
+                    int query_match_num_by_threshold_by_mode = 0;
+                    int baseline_edit_distance_by_threshold_by_mode = 0;
+                    int query_edit_distance_by_threshold_by_mode = 0;
+                    for(int i = 0; i < thread_num; i++){ // sum the per-thread counters (this i shadows the unsigned i above)
+                        baseline_match_num_by_threshold_by_mode += baseline_total_match_num[i][t]->at(mode_index);
+                        query_match_num_by_threshold_by_mode += query_total_match_num[i][t]->at(mode_index);
+                        baseline_edit_distance_by_threshold_by_mode += baseline_total_edit_distance[i][t]->at(mode_index);
+                        query_edit_distance_by_threshold_by_mode += query_total_edit_distance[i][t]->at(mode_index);
+                    }
+                    baseline_match_num_string += to_string(baseline_match_num_by_threshold_by_mode);
+                    query_match_num_string += to_string(query_match_num_by_threshold_by_mode);
+                    query_total_num_string += to_string((int)(query_variant_total_num * (1-per_list[t])) ); // query total scaled by (1 - per_list[t]) and truncated to int
+                    baseline_edit_distance_string += to_string(baseline_edit_distance_by_threshold_by_mode);
+                    query_edit_distance_string += to_string(query_edit_distance_by_threshold_by_mode);
+                    if(t < threshold_num-1){
+                        threshold_string += ",";
+                        baseline_match_num_string += ",";
+                        query_match_num_string += ",";
+                        query_total_num_string += ",";
+                        baseline_edit_distance_string += ",";
+                        query_edit_distance_string += ",";
+                    }
+                }
+                string total_match_num_string = to_string(score_unit) + "\t" +
+                                                to_string(match_mode) + "\t" + 
+                                                to_string(score_scheme) + "\t" +
+                                                threshold_string + "\t" +
+                                                baseline_match_num_string + "\t" + 
+                                                query_match_num_string + "\t" + 
+                                                query_total_num_string;// + "\t" + to_string(mode_index);
+                cout << total_match_num_string << "\t" << baseline_edit_distance_string << "\t" << query_edit_distance_string << endl;;
+                output_stat_file << total_match_num_string << endl; // NOTE(review): unlike stdout, the file row omits the two edit-distance columns named in the header — confirm whether intended
+            }
+        }
+    }
+    output_stat_file.close();
+    int bench_mode_index = GetIndexFromMatchScore(0, 0, 0); // only referenced by the commented-out code below
+    for(int x = 0; x < score_unit_list.size(); x++){ // one .match file per requested combination
+        score_unit = score_unit_list[x];
+        for(int y = 0; y < match_mode_list.size(); y++){
+            match_mode = match_mode_list[y];
+            for(int z = 0; z < score_scheme_list.size(); z++){
+                score_scheme = score_scheme_list[z];
+                int mode_index = GetIndexFromMatchScore(score_unit, match_mode, score_scheme);
+                string filename_index = to_string(score_unit) + "_" + to_string(match_mode) + "_" + to_string(score_scheme);
+                ofstream output_complex_file;
+                output_complex_file.open(output_dir + "/" + output_prefix+"."+filename_index+".match");
+                output_complex_file << "##VCF1:" << ref_vcf_filename << endl;
+                output_complex_file << "##VCF2:" << que_vcf_filename << endl;
+                output_complex_file << "#CHROM\tPOS\tREF\tALT\tVCF1\tVCF2\tPHASE1\tPHASE2\tSCORE" << endl;
+                for(int i = 0; i < thread_num; i++){
+                    for(int k = 0; k < match_records_by_mode_by_thread[i][mode_index]->size(); k++){
+                        if (match_records_by_mode_by_thread[i][mode_index]->at(k).find_first_not_of(' ') != std::string::npos) { // skip records that are empty or contain only spaces
+                            //if(match_records_by_mode_by_thread[i][mode_index]->at(k)[0] == '$'){
+                                //int bench_mode_index = stoi(match_records_by_mode_by_thread[i][mode_index]->at(k).erase(0,1));
+                                //output_complex_file << match_records_by_mode_by_thread[i][0]->at(k);
+                            //}else{
+                                output_complex_file << match_records_by_mode_by_thread[i][mode_index]->at(k); // records already end with '\n'
+                            //}
+                        }
+                    }
+                }
+                output_complex_file.close();
+            }
+        }
+    }
+    // clear all matching records
+	for(int i = 0; i < thread_num; i++){ // release in reverse order of allocation: vectors, then per-thread arrays, then the outer arrays
+        for(int j = 0; j < MATCH_MODE_NUM; j++){
+            delete match_records_by_mode_by_thread[i][j];
+        }
+        for(int j = 0; j < ROC_SAMPLE_NUM; j++){
+            delete baseline_total_match_num[i][j];
+            delete query_total_match_num[i][j];
+            delete baseline_total_edit_distance[i][j];
+            delete query_total_edit_distance[i][j];
+        }
+        delete[] match_records_by_mode_by_thread[i];
+        delete[] baseline_total_match_num[i];
+        delete[] query_total_match_num[i];
+        delete[] baseline_total_edit_distance[i];
+        delete[] query_total_edit_distance[i];
+	}
+	delete[] match_records_by_mode_by_thread;
+    delete[] baseline_total_match_num;
+    delete[] query_total_match_num;
+    delete[] baseline_total_edit_distance;
+    delete[] query_total_edit_distance;
+//[TODO] unit test
+// Normalize a (ref, alt0, alt1) allele triple in place.
+//   1. While all three alleles end in the same base, drop that base. This is done only when
+//      every allele keeps at least one base or there is still reference sequence to the left.
+//   2. If an allele became empty, extend all three to the left with the upper-cased reference base.
+//   3. Finally drop shared leading bases while every allele keeps at least one base.
+// pos is the variant position used to index genome_sequences[chr_id]; chr_id selects the chromosome.
+// Returns the position of the normalized variant (shifted left/right by the steps above),
+// or -1 when no sequence is loaded for chr_id.
+int WholeGenome::NormalizeVariantSequence(int pos, string & parsimonious_ref, string & parsimonious_alt0, string & parsimonious_alt1, int chr_id) {
+	int left_index = pos;
+	if (genome_sequences[chr_id].size() == 0) return -1;
+	// Three single-base alleles are already minimal. Return the position itself: the previous
+	// `return true` converted to 1, which is not a position.
+	if (parsimonious_ref.size() == 1 && parsimonious_alt0.size() == 1 && parsimonious_alt1.size() == 1) return left_index;
+	bool change_in_allels = true;
+	while (change_in_allels) {
+		change_in_allels = false;
+		// back() on an empty string is undefined, so only compare last bases when all alleles have one.
+		bool all_non_empty = !parsimonious_ref.empty() && !parsimonious_alt0.empty() && !parsimonious_alt1.empty();
+		if (all_non_empty && parsimonious_ref.back() == parsimonious_alt0.back() && parsimonious_ref.back() == parsimonious_alt1.back() ) {
+			if ((parsimonious_ref.size() > 1 && parsimonious_alt0.size() > 1 && parsimonious_alt1.size() > 1) || left_index > 0) { // when left_index == 0, can not make further changes
+				parsimonious_ref.pop_back();
+				parsimonious_alt0.pop_back();
+				parsimonious_alt1.pop_back();
+				change_in_allels = true;
+			}
+			// else do not make further changes
+		}
+		bool any_empty = parsimonious_ref.length() == 0 || parsimonious_alt0.length() == 0 || parsimonious_alt1.length() == 0;
+		// Left-extend only while a reference base exists to the left; at left_index == 0 the
+		// previous code would have read genome_sequences[chr_id][-1].
+		if (any_empty && left_index > 0) {
+			left_index--;
+			char left_char = toupper(genome_sequences[chr_id][left_index]);
+			parsimonious_ref = left_char + parsimonious_ref;
+			parsimonious_alt0 = left_char + parsimonious_alt0;
+			parsimonious_alt1 = left_char + parsimonious_alt1;
+		}
+	}
+	while (parsimonious_ref[0] == parsimonious_alt0[0] &&
+            parsimonious_ref[0] == parsimonious_alt1[0] &&
+            parsimonious_ref.size() > 1 &&
+            parsimonious_alt0.size() > 1 &&
+            parsimonious_alt1.size() > 1)
+    {
+		parsimonious_ref.erase(0, 1);
+		parsimonious_alt0.erase(0, 1);
+		parsimonious_alt1.erase(0, 1);
+        left_index ++; // dropping the leftmost base moves the variant position one to the right
+	}
+	return left_index;
+void WholeGenome::SingleThreadClustering(int chr_id) { // Sorts the baseline and query variants of chromosome chr_id, walks both lists in position order and groups them into clusters appended to variant_cluster_by_chrid[chr_id].
+	int ins_len[2] = { 0 }; // accumulated snp.mil in the current cluster, indexed by flag (0 or 1)
+	int del_len[2] = { 0 }; // accumulated snp.mdl in the current cluster, indexed by flag (0 or 1)
+	int c_start = 0; // largest (pos + ref length) seen so far in the current cluster
+	int c_end = 0; // position of the variant currently being examined
+    sort(ref_variant_by_chrid[chr_id]->begin(), ref_variant_by_chrid[chr_id]->end());
+    sort(que_variant_by_chrid[chr_id]->begin(), que_variant_by_chrid[chr_id]->end());
+    int ref_size = ref_variant_by_chrid[chr_id]->size();
+    int que_size = que_variant_by_chrid[chr_id]->size();
+    //dout << chr_id << "," << ref_size << "," << que_size << endl;
+    int ref_index = 0;
+    int que_index = 0;
+    bool not_first = false;
+    DiploidVariant snp;
+    vector<VariantIndicator> vi_list; // members of the cluster being built
+    while (ref_index < ref_size || que_index < que_size) { // two-way merge over both sorted lists
+		bool take_que = true; // the query variant is taken unless the baseline one is strictly earlier or the query list is exhausted
+		if(ref_index < ref_size && que_index < que_size){
+            if(ref_variant_by_chrid[chr_id]->at(ref_index).pos < que_variant_by_chrid[chr_id]->at(que_index).pos){
+                take_que = false;
+            }
+		}else if(ref_index < ref_size){
+            take_que = false;
+		}
+        int var_index;
+		if(take_que){
+            snp = que_variant_by_chrid[chr_id]->at(que_index);
+            //cout << "q |" << que_index << "," << snp.pos << endl;
+            var_index = que_index;
+            que_index++;
+		}else{
+            snp = ref_variant_by_chrid[chr_id]->at(ref_index);
+            //cout << "r |" << ref_index << "," << snp.pos << endl;
+            var_index = ref_index;
+            ref_index++;
+		}
+		// decide whether the current cluster must be closed before adding this variant
+		if (not_first) {
+			c_end = snp.pos;
+			if (c_end - c_start >= 2) { // at least two reference bases lie between the cluster so far and this variant
+                int separator_length = c_end - c_start;
+				string separator = genome_sequences[chr_id].substr(c_start, separator_length); // reference bases in that gap
+				int max_change = max(ins_len[0] + del_len[1], ins_len[1] + del_len[0]);
+				bool separate_cluster = false;
+				if(max_change == 0){
+                    separate_cluster = true;
+				}
+				else if (separator_length > 2 * max_change &&
+					(separator_length > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, max_change)))
+				{
+				    separate_cluster = true;
+				}
+				if(separate_cluster){
+                    variant_cluster_by_chrid[chr_id]->push_back(vi_list); // close the cluster; the current variant starts the next one
+                    vi_list.clear();
+					ins_len[0] = 0;
+					del_len[0] = 0;
+					ins_len[1] = 0;
+					del_len[1] = 0;
+					c_start = 0; // reset; the max() below then sets it from the current variant
+				}
+			}
+		}
+		c_start = max(c_start, snp.pos + (int)snp.ref.length() );
+        VariantIndicator current_variant_indicator(chr_id, var_index, !take_que); // third argument is true for baseline variants, false for query variants
+        vi_list.push_back(current_variant_indicator);
+		//cluster_vars_map[cluster_index].push_back(snp);
+		if(!not_first) not_first = true;
+		int ref_length = (int)(snp.ref.length()); // not used below
+		int flag = 0;
+        if(snp.flag) flag = 1;
+//        DiploidVariant snp = front_cluster[k];
+//        int rq = snp.flag;
+        ins_len[flag] += snp.mil;
+        del_len[flag] += snp.mdl;
+	}
+    if(vi_list.size() > 0){ // flush the last, still open cluster
+        variant_cluster_by_chrid[chr_id]->push_back(vi_list);
+    }
+// Reads the baseline VCF by forwarding to ReadWholeGenomeVariant with its second argument set
+// to false (ReadQueryVariants passes true). Returns whatever that call returns; ReadRef stores
+// it as the baseline variant total.
+int WholeGenome::ReadReferenceVariants(string filename){
+    const bool load_as_query = false;
+    return ReadWholeGenomeVariant(filename, load_as_query);
+// Reads the query VCF by forwarding to ReadWholeGenomeVariant with its second argument set
+// to true (ReadReferenceVariants passes false). Returns whatever that call returns; Compare
+// stores it as the query variant total.
+int WholeGenome::ReadQueryVariants(string filename){
+    const bool load_as_query = true;
+    return ReadWholeGenomeVariant(filename, load_as_query);
+// Loads the shared inputs of a comparison: the genome sequence file first, then the baseline VCF.
+// Records the baseline variant count and remembers the baseline VCF file name.
+void WholeGenome::ReadRef(string genome_seq, string ref_vcf){
+    ReadWholeGenomeSequence(genome_seq);
+    const int loaded_baseline_variants = ReadReferenceVariants(ref_vcf);
+    baseline_variant_total_num = loaded_baseline_variants;
+    ref_vcf_filename = ref_vcf;
+// Compare one query VCF against the baseline loaded earlier (ref_vcf_filename and
+// baseline_variant_total_num are expected to be set already, as ReadRef does).
+//   query_vcf      - path of the query VCF
+//   output_prefix  - prefix of the result files written under output_dir
+//   detail_results - stored in the member of the same name
+//   score_unit_    - score unit to evaluate, or -1 for both 0 and 1
+//   match_mode_    - match mode to evaluate, or -1 for both 0 and 1
+//   score_scheme_  - score scheme to evaluate, -1 for 0, 1 and 2; the value 3 selects
+//                    DirectMatch and skips clustering entirely
+// Query-side state and the mode lists are cleared before returning so the object can be reused.
+void WholeGenome::Compare(string query_vcf,
+    string output_prefix,
+    bool detail_results,
+    int score_unit_,
+    int match_mode_,
+    int score_scheme_)
+    // initialize query variant data structure
+    que_vcf_filename = query_vcf;
+    this->output_prefix = output_prefix;
+    this->detail_results = detail_results;
+    score_unit_indicator = score_unit_;
+    match_mode_indicator = match_mode_;
+    score_scheme_indicator = score_scheme_;
+    if(score_scheme_indicator == 3){
+        DirectMatch(ref_vcf_filename, query_vcf, match_mode_, output_prefix);
+        return;
+    }
+    query_variant_total_num = ReadQueryVariants(query_vcf);
+    // Expand each indicator into the list of values to evaluate (-1 means "all").
+    if(score_unit_indicator == -1){
+        score_unit_list.push_back(0);
+        score_unit_list.push_back(1);
+    }else{
+        score_unit_list.push_back(score_unit_indicator);
+    }
+    if(match_mode_indicator == -1){
+        match_mode_list.push_back(0);
+        match_mode_list.push_back(1);
+    }else{
+        match_mode_list.push_back(match_mode_indicator);
+    }
+    if(score_scheme_indicator == -1){
+        score_scheme_list.push_back(0);
+        score_scheme_list.push_back(1);
+        score_scheme_list.push_back(2);
+    }else{
+        score_scheme_list.push_back(score_scheme_indicator);
+    }
+    // Precompute the mode index of every requested combination.
+    for(size_t i = 0; i < score_unit_list.size(); i++){
+        for(size_t j = 0; j < match_mode_list.size(); j++){
+            for(size_t k = 0; k < score_scheme_list.size(); k++){
+                // The first argument is the score unit. It used to be read from
+                // score_scheme_list[i], which yields wrong indices and can read out of range.
+                int mode_index = GetIndexFromMatchScore(score_unit_list[i], match_mode_list[j], score_scheme_list[k]);
+                mode_index_list.push_back(mode_index);  // so that I can directly know how many mode, do not need to calculate all the time
+            }
+        }
+    }
+    cout << "Baseline VCF: " << ref_vcf_filename << endl;
+    cout << "Query VCF: " << query_vcf << endl;
+    cout << "========VCF Stat.==========" << endl;
+    cout << "Total Number of VCF Entries: " << endl;
+    cout << "Baseline: " << baseline_variant_total_num << "; Query: " << query_variant_total_num << endl;
+    cout << "parallel clustering..." << endl;
+    ParallelClustering();
+    cout << "matching variants..." << endl;
+    ClusteringMatchMultiThread();
+    // most clustering results are cleared inside ParallelClustering function except the following one
+    // which is needed for matching
+    variants_by_cluster.clear();
+    // clean at the end of function
+    for(int j = 0; j < chrom_num; j++){
+        // The per-chromosome vectors are emptied but kept allocated.
+        que_variant_by_chrid[j]->clear();
+        //delete que_variant_by_chrid[j];
+    }
+    //delete[] que_variant_by_chrid;
+    query_variant_strings.clear();
+    query_variant_total_num = 0;
+    threshold_list.clear();
+    threshold_num = 0;
+    // The following three matching results are cleared inside ClusteringMatchMultiThread function
+    // match_records_by_mode_by_thread;
+    // baseline_total_match_num;
+    // query_total_match_num;
+    score_unit_list.clear();
+    match_mode_list.clear();
+    score_scheme_list.clear();
+    mode_index_list.clear();
+    return;
+// Position-based direct matching of query variants against the baseline variants already held
+// in ref_variant_by_chrid (ref_vcf itself is not read here).
+//   query_vcf     - query VCF, loaded through ReadQueryVariants
+//   match_mode_   - 1 compares with CompareNoGenotype, any other value with operator==
+//   output_prefix - results go to <output_dir>/<output_prefix>.direct
+// For every query variant the baseline variants at the same position are scanned and the first
+// match is counted. Only matches found with operator== are written to the output file; the
+// total count is reported through dout.
+// NOTE(review): the written position is ref_var.pos as stored, while the match-record builders
+// print pos+1 — confirm which convention the .direct file should use.
+void WholeGenome::DirectMatch(string ref_vcf, string query_vcf, int match_mode_, string output_prefix)
+    match_mode_indicator = match_mode_;
+    // Called for its side effect of filling que_variant_by_chrid; the returned count is not needed.
+    ReadQueryVariants(query_vcf);
+    int match_num = 0;
+    ofstream output_stat_file;
+    output_stat_file.open(output_dir + "/" + output_prefix+".direct");
+    for(int i = 0; i < chrom_num; i++){
+        if(ref_variant_by_chrid[i]->size() == 0 || que_variant_by_chrid[i]->size() == 0)
+            continue;
+        // Index the baseline variants of this chromosome by position; a multimap keeps
+        // several variants that share one position.
+        multimap<int, int> ref_variant_by_pos;
+        for(int j = 0; j < ref_variant_by_chrid[i]->size(); j++){
+            ref_variant_by_pos.insert(pair<int, int>(ref_variant_by_chrid[i]->at(j).pos, j));
+        }
+        for(int j = 0; j < que_variant_by_chrid[i]->size(); j++){
+            // References avoid copying a DiploidVariant for every comparison.
+            DiploidVariant & var = que_variant_by_chrid[i]->at(j);
+            // equal_range yields an empty range when the position is absent, so no separate find() is needed.
+            pair<multimap<int, int>::iterator, multimap<int, int>::iterator> var_range = ref_variant_by_pos.equal_range(var.pos);
+            for(multimap<int, int>::iterator it = var_range.first; it != var_range.second; ++it){
+                DiploidVariant & ref_var = ref_variant_by_chrid[i]->at(it->second);
+                if (match_mode_indicator != 1 && var == ref_var){
+                    match_num ++;
+                    string matched_variant = chrname_by_chrid[i] + "\t" + to_string(ref_var.pos) + "\t" + ref_var.ref + "\t";
+                    output_stat_file << matched_variant << endl;
+                    break;
+                }else if(match_mode_indicator == 1 && var.CompareNoGenotype(ref_var)){
+                    match_num ++;
+                    break;
+                }
+            }
+        }
+    }
+    output_stat_file.close();
+    dout << "matched variants: " << match_num << endl;
diff --git a/src/wholegenome.h b/src/wholegenome.h
new file mode 100644
index 0000000..2817ee9
--- /dev/null
+++ b/src/wholegenome.h
@@ -0,0 +1,367 @@
+#pragma once
+#include <unordered_map>
+#include <algorithm>
+#include <vector>
+#include <map>
+#include <list>
+#include <tuple>
+#include <chrono>
+#include <cstdint>
+#include <limits>
+#include <thread>
+#include <set>
+#include "util.h"
+#include "diploidvariant.h"
+//#include "tbb/task_scheduler_init.h"
+//#include "tbb/blocked_range.h"
+//#include "tbb/parallel_for.h"
+//#include "tbb/concurrent_vector.h"
+typedef struct VariantIndicator{ // Small handle that names one variant by chromosome id, variant id and a 'refer' flag.
+    VariantIndicator(int chr_id_ = -1, // every argument is optional; the ids default to -1
+    int var_id_ = -1,
+    bool refer_ = true) :
+    chr_id(chr_id_),
+    var_id(var_id_),
+    refer(refer_){}
+    char chr_id; // NOTE(review): stored as plain char but initialised from an int (default -1); char signedness is implementation-defined, so -1 may read back as 255 on unsigned-char targets -- confirm
+    int var_id; // presumably an index into the per-chromosome variant vector -- TODO confirm against the clustering code
+    bool refer; // presumably true = baseline (reference) variant, false = query variant -- TODO confirm
+typedef struct Interval { // Integer interval [start, end]; callers in wholegenome_backup.cpp store an inclusive end position.
+    int start;
+    int end;
+    Interval() : start(0), end(0) {} // default: the interval [0, 0]
+    Interval(int s, int e) : start(s), end(e) {} // no check that s <= e is performed
+class SequencePath{ // Working state (sequences, choices, score) of one path handled by the WholeGenome Path* routines.
+    SequencePath(int n, int v) // n: reference length, sizes the four sequence buffers; v: number of entries created in choice_vector
+    {
+        reference_length = n;
+        for(int i = 0; i < 4; i++){
+            string_sequences[i].resize(n, ".");
+            // every position starts out as the placeholder "."
+            donor_sequences[i] = "";
+        }
+        current_genome_pos = -1; // -1: no genome position has been consumed yet
+        score = 0;
+        removable = false;
+        same_donor_len = false;
+        current_equal_donor_pos[0] = -1;
+        current_equal_donor_pos[1] = -1;
+        reached_sync_num = 0;
+        for(int i = 0; i < v; i++){
+            choice_vector.push_back(-89); // -89 is a sentinel; NOTE(review): presumably "no choice recorded" (it lies below WholeGenome::MEANING_CHOICE_BOUND = -10) -- confirm
+        }
+    }
+    int reference_length;
+    vector<string> string_sequences[4]; // four per-position string buffers of length reference_length
+    map<int, pair<int, int>> choice_made[2]; // records whether a choice has been made and which one
+    // one choice is a pair: variant id, phasing index
+    int current_genome_pos;
+    string donor_sequences[4]; // four donor strings, empty on construction
+    int current_equal_donor_pos[2];
+    int score;
+    bool removable;
+    bool same_donor_len;
+    int reached_sync_num;
+    vector<int> choice_vector; // v entries, all initialised to the -89 sentinel
+class WholeGenome{ // Holds the genome, the baseline/query variants and the matching configuration for a whole-genome comparison.
+    int chrom_num; // chromosome count; used as the length of the per-chromosome arrays in wholegenome_backup.cpp
+    int thread_num; // number of worker threads
+    string ref_vcf_filename;
+    string que_vcf_filename;
+    int baseline_variant_total_num;
+    int query_variant_total_num;
+    vector<string> baseline_variant_strings;
+    vector<string> query_variant_strings; // raw VCF line of every stored query variant
+    bool detail_results;
+    //int thread_num; VCF->DiploidVariant->WholeGenome
+    map<string, int> chrid_by_chrname; // chromosome name -> id, for chromosomes found in the genome file
+    map<int, string> chrname_by_chrid; // chromosome id -> name, for chromosomes found in the genome file
+    map<string, int> chrname_dict; // dictionary of accepted chromosome names -> 0-based id
+    map<int, string> genome_sequences; // chromosome id -> sequence
+    vector<DiploidVariant> ** ref_variant_by_chrid; // array indexed by chromosome id; each entry points to that chromosome's baseline variants
+    vector<DiploidVariant> ** que_variant_by_chrid; // same layout for the query variants
+    vector<vector<VariantIndicator>> ** variant_cluster_by_chrid;
+    // the cluster list of one chromosome is a vector<vector<VariantIndicator>>;
+    // variant_cluster_by_chrid points to an array of pointers to such lists,
+    // one entry per chromosome id
+    vector<vector<VariantIndicator>> variants_by_cluster; // clusters of all chromosomes, concatenated
+    vector<string> *** match_records_by_mode_by_thread;
+    //vector<int> *** baseline_matches_by_mode_by_thread;
+    //vector<int> *** query_matches_by_mode_by_thread;
+    vector<int> *** baseline_total_match_num;
+    vector<int> *** query_total_match_num;
+    vector<int> *** baseline_total_edit_distance;
+    vector<int> *** query_total_edit_distance;
+    //map<float, int> *** tp_qual_num_by_mode_by_thread;
+    //map<float, int> *** fp_qual_num_by_mode_by_thread;
+    //map<float, int> query_total_qual_num;
+    string output_prefix;
+    string output_dir;
+    // matching configuration
+    int score_unit_indicator; // NOTE(review): wholegenome_backup.cpp treats -1 as "use every value" for the three indicators -- confirm for the current .cpp
+    int match_mode_indicator;
+    int score_scheme_indicator;
+    vector<int> score_unit_list; // concrete score units to evaluate
+    vector<int> match_mode_list; // concrete match modes to evaluate
+    vector<int> score_scheme_list; // concrete score schemes to evaluate
+    vector<int> mode_index_list; // one GetIndexFromMatchScore() index per (unit, mode, scheme) combination
+    vector<double> threshold_list;
+    int threshold_num;
+    vector<float> per_list;
+    bool ReadWholeGenomeSequence(string filename);
+    bool ReadGenomeSequenceList(string filename);
+    int ReadWholeGenomeVariant(string filename, bool flag);
+    bool ReadVariantFileList(string filename);
+    int ReadReferenceVariants(string filename);
+    int ReadQueryVariants(string filename);
+    bool ParallelClustering(); // parallel by chr id
+    bool ParallelMatching(); // parallel by task
+    bool TBBMatching();
+    void SingleThreadClustering(int chr_id);
+    //bool MatchingSingleCluster(int cluster_index, int thread_index, int match_mode);
+    //override
+    bool ClusteringMatchInThread(int start, int end, int thread_index);
+    void ClusteringMatchMultiThread();
+    int NormalizeVariantSequence(int pos,
+                             string & parsimonious_ref,
+                             string & parsimonious_alt0,
+                             string & parsimonious_alt1,
+                             int chr_id);
+    struct compInterval { // Comparator ordering Intervals by ascending start; 'end' is not compared.
+        bool operator()(const Interval &a, const Interval &b) const {
+            return a.start<b.start;
+        }
+    };
+    // Sort 'intervals' in place by start and return a new list in which overlapping
+    // intervals are coalesced. Two intervals overlap when the next start is <= the
+    // end of the last kept interval; intervals that are merely adjacent stay separate.
+    vector<Interval> merge(vector<Interval> &intervals) {
+        sort(intervals.begin(), intervals.end(), compInterval());
+        vector<Interval> merged;
+        for(const Interval & current : intervals) {
+            if(!merged.empty() && current.start <= merged.back().end) {
+                // overlaps the last kept interval: stretch it when this one reaches further
+                Interval & last = merged.back();
+                if(last.end < current.end) {
+                    last.end = current.end;
+                }
+            } else {
+                merged.push_back(current);
+            }
+        }
+        return merged;
+    }
+    int PathNeedDecision(SequencePath& sp, multimap<int, int> * choices_by_pos[], int pos);
+    int PathExtendOneStep(SequencePath& sp,
+                          multimap<int, int> * choices_by_pos[],
+                          const string & reference_sequence,
+                          vector<int> & sync_points,
+                          int match_mode,
+                          int & variant_need_decision);
+    bool PathMakeDecision(SequencePath& sp,
+                                 vector<DiploidVariant> & variant_list,
+                                 multimap<int, int> * choices_by_pos[],
+                                 list<SequencePath> & sequence_path_list,
+                                 const string & reference_sequence,
+                                 int score_unit,
+                                 int match_mode,
+                                 int score_scheme);
+    bool VariantMakeDecision(SequencePath& sp,
+                             vector<DiploidVariant> & variant_list,
+                             list<SequencePath> & sequence_path_list,
+                             const string & reference_sequence,
+                             int score_unit,
+                             int match_mode,
+                             int score_scheme,
+                             int variant_index);
+    bool VariantMakeDecisionNoGenotype(SequencePath& sp,
+                         vector<DiploidVariant> & variant_list,
+                         list<SequencePath> & sequence_path_list,
+                         const string & reference_sequence,
+                         int score_unit,
+                         int match_mode,
+                         int score_scheme,
+                         int variant_index);
+    bool AppendChangedSp(SequencePath& sp,
+                         vector<DiploidVariant> & variant_list,
+                         list<SequencePath> & sequence_path_list,
+                         const string & reference_sequence,
+                         int score_unit,
+                         int match_mode,
+                         int score_scheme,
+                         int variant_index,
+                         int c);
+    bool AppendChangedSpNoGenotype(SequencePath& sp,
+                         vector<DiploidVariant> & variant_list,
+                         list<SequencePath> & sequence_path_list,
+                         const string & reference_sequence,
+                         int score_unit,
+                         int match_mode,
+                         int score_scheme,
+                         int variant_index,
+                         int c);
+    bool PathMakeDecisionBackup(SequencePath& sp,
+                                 vector<DiploidVariant> & variant_list,
+                                 multimap<int, int> * choices_by_pos[],
+                                 list<SequencePath> & sequence_path_list,
+                                 const string & reference_sequence,
+                                 int score_unit,
+                                 int match_mode,
+                                 int score_scheme);
+    bool MatchingSingleClusterBaseExtending(int cluster_index,
+                                            int thread_index,
+                                            vector<DiploidVariant> & variant_list,
+                                            string & subsequence,
+                                            int offset,
+                                            multimap<int, int> * choices_by_pos[],
+                                            vector<int> & sync_points,
+                                            int chr_id,
+                                            int score_unit,
+                                            int match_mode,
+                                            int score_scheme,
+                                            int threshold_index);
+    bool DonorLengthEqual(SequencePath & a, SequencePath & b);
+    void ConvergePaths(list<SequencePath> & path_list);
+    int CheckPathEqualProperty(SequencePath & sp, int match_mode);
+    int ScoreEditDistance(DiploidVariant & dv, int allele_indicator);
+    int EditDistance(const std::string& s1, const std::string& s2);
+    bool PathMakeDecisionNoGenotype(SequencePath& sp,
+                                 vector<DiploidVariant> & variant_list,
+                                 multimap<int, int> * choices_by_pos[],
+                                 list<SequencePath> & sequence_path_list,
+                                 const string & reference_sequence,
+                                 int score_unit,
+                                 int match_mode,
+                                 int score_scheme);
+    void ConstructMatchRecord(SequencePath & best_path,
+                               vector<DiploidVariant> & variant_list,
+                               string & subsequence,
+                               int offset,
+                               int thread_index,
+                               int chr_id,
+                               int mode_index,
+                               int threshold_index);
+    void ConstructMatchRecordBackup(SequencePath & best_path,
+                               vector<DiploidVariant> & variant_list,
+                               string & subsequence,
+                               int offset,
+                               int thread_index,
+                               int chr_id,
+                               int mode_index,
+                               int threshold_index);
+    void ConstructMatchRecordNoGenotype(SequencePath & best_path,
+                                       vector<DiploidVariant> & variant_list,
+                                       string & subsequence,
+                                       int offset,
+                                       int thread_index,
+                                       int chr_id,
+                                       int mode_index,
+                                       int threshold_index);
+    void ConstructMatchRecordNoGenotypeBackup(SequencePath & best_path,
+                                       vector<DiploidVariant> & variant_list,
+                                       string & subsequence,
+                                       int offset,
+                                       int thread_index,
+                                       int chr_id,
+                                       int mode_index,
+                                       int threshold_index);
+    int CalculateScore(DiploidVariant & dv,
+                       int choice,
+                       int score_unit,
+                       int match_mode,
+                       int score_scheme);
+    int GetIndexFromMatchScore(int score_unit, int match_mode, int score_scheme);
+    bool ClearQuery();
+    // Upper-case 's' in place.
+    // Each character is passed to toupper() as unsigned char: calling toupper() with a
+    // negative value (a non-ASCII byte where char is signed) is undefined behaviour.
+    inline void ToUpper(string & s){
+        transform(s.begin(), s.end(), s.begin(),
+                  [](unsigned char c){ return static_cast<char>(::toupper(c)); });
+    }
+    bool CheckTandemRepeat(string sequence, int unit_threshold);
+    bool MatchVariantListInThread(int thread_index, 
+        int threshold_index,
+        int chr_id,
+        vector<DiploidVariant> & variant_list,
+        int cluster_id);
+    void initialize_score_matrix(int **score, char **trackBack, int M, int N);
+    int needleman_wunsch(string S1, string S2, string &R1, string &R2);
+    void GenerateAltVector(string ref, string alt, vector<string> & alt_vector);
+    int CalculateEditDistance(DiploidVariant & dv,
+                                int choice,
+                                int match_mode);
+    WholeGenome(int thread_num_,
+                string output_dir_,
+                bool pr_curves_);
+    ~WholeGenome();
+    void ReadRef(string genome_seq, 
+      string ref_vcf);
+    void Compare(string query_vcf,
+        string output_prefix,
+        bool detail_results,
+        int score_unit_,
+        int match_mode_,
+        int score_scheme_);
+    void DirectMatch(string ref_vcf,
+                string query_vcf,
+                int match_mode_,
+                string output_prefix);
+    int test(); // for direct test
+    void PrintPath(SequencePath & sp);
+    const static int MATCH_MODE_NUM = 16; // GetIndexFromMatchScore in wholegenome_backup.cpp packs its result into 4 bits, i.e. 16 possible indices
+    const static int VAR_LEN = 100; // wholegenome_backup.cpp skips variants whose insertion or deletion length exceeds this when reading a VCF
+    const static int MAX_REPEAT_LEN = 1000;
+    const static int ROC_SAMPLE_NUM = 5;
+    const static int MEANING_CHOICE_BOUND = -10; // NOTE(review): presumably choice values below this bound are sentinels rather than real choices -- confirm
+    const static int NOT_USE = -9; // NOTE(review): presumably the choice value marking a variant as not used -- confirm
+    const static int EASY_MATCH_VAR_NUM = 5;
diff --git a/src/wholegenome_backup.cpp b/src/wholegenome_backup.cpp
new file mode 100644
index 0000000..fbfe718
--- /dev/null
+++ b/src/wholegenome_backup.cpp
@@ -0,0 +1,2056 @@
+#include "wholegenome.h"
+using namespace std;
+// Backup constructor: stores the run configuration, expands every "-1" indicator into the
+// list of concrete values it stands for, precomputes the mode index of each
+// (score unit, match mode, score scheme) combination, allocates the per-chromosome
+// baseline variant vectors and fills the chromosome name dictionary.
+//   thread_num_    number of worker threads
+//   score_unit_    score unit, or -1 to evaluate both 0 and 1
+//   match_mode_    match mode, or -1 to evaluate both 0 and 1
+//   score_scheme_  score scheme, or -1 to evaluate 0, 1 and 2
+//   output_dir_    directory that receives the output files
+WholeGenome::WholeGenome(int thread_num_, 
+    int score_unit_, 
+    int match_mode_, 
+    int score_scheme_,
+    string output_dir_){
+    thread_num = thread_num_;
+    chrom_num = 24; // chromosomes 1-22, X and Y
+    score_unit_indicator = score_unit_;
+    match_mode_indicator = match_mode_;
+    score_scheme_indicator = score_scheme_;
+    output_dir = output_dir_;
+    // an indicator of -1 means "evaluate every supported value"
+    if(score_unit_indicator == -1){
+        score_unit_list.push_back(0);
+        score_unit_list.push_back(1);
+    }else{
+        score_unit_list.push_back(score_unit_indicator);
+    }
+    if(match_mode_indicator == -1){
+        match_mode_list.push_back(0);
+        match_mode_list.push_back(1);
+    }else{
+        match_mode_list.push_back(match_mode_indicator);
+    }
+    if(score_scheme_indicator == -1){
+        score_scheme_list.push_back(0);
+        score_scheme_list.push_back(1);
+        score_scheme_list.push_back(2);
+    }else{
+        score_scheme_list.push_back(score_scheme_indicator);
+    }
+    // Cache the mode index of every combination so it is not recomputed during matching.
+    for(size_t i = 0; i < score_unit_list.size(); i++){
+        for(size_t j = 0; j < match_mode_list.size(); j++){
+            for(size_t k = 0; k < score_scheme_list.size(); k++){
+                // The first argument is the score unit, so it has to be read from
+                // score_unit_list; i only ranges over that list and indexing
+                // score_scheme_list with it yields the wrong unit or reads out of range.
+                int mode_index = GetIndexFromMatchScore(score_unit_list[i], match_mode_list[j], score_scheme_list[k]);
+                mode_index_list.push_back(mode_index);
+            }
+        }
+    }
+    // one baseline variant vector per chromosome id
+    ref_variant_by_chrid = new vector<DiploidVariant>*[chrom_num];
+    for (int j = 0; j < chrom_num; j++) {
+        ref_variant_by_chrid[j] = new vector<DiploidVariant>;
+    }
+    // Chromosome ids are 0-based; each chromosome is registered with and without the
+    // "chr" prefix: 1-22 -> 0-21, X -> 22, Y -> 23.
+    for(int j = 1; j <= 22; j++){
+        string chr_name = to_string(j);
+        chrname_dict[chr_name] = j-1;
+        chr_name = "chr"+chr_name;
+        chrname_dict[chr_name] = j-1;
+    }
+    chrname_dict["X"] = 22;
+    chrname_dict["chrX"] = 22;
+    chrname_dict["Y"] = 23;
+    chrname_dict["chrY"] = 23;
+// Pack the three settings into a 4-bit mode index:
+//   bit 3     = lowest bit of score_unit
+//   bit 2     = lowest bit of match_mode
+//   bits 1..0 = lowest two bits of score_scheme
+inline int WholeGenome::GetIndexFromMatchScore(int score_unit, int match_mode, int score_scheme){
+    const int unit_bit = (score_unit & 1) << 3;
+    const int mode_bit = (match_mode & 1) << 2;
+    const int scheme_bits = score_scheme & 3;
+    return unit_bit | mode_bit | scheme_bits;
+    for(int j = 0; j < chrom_num; j++){
+        ref_variant_by_chrid[j]->clear();
+        delete ref_variant_by_chrid[j];
+    }
+    delete[] ref_variant_by_chrid;
+// Load a (multi-)FASTA genome file.
+// Each record whose name (text after '>' up to the first space) is listed in chrname_dict
+// is stored in genome_sequences and registered in chrid_by_chrname / chrname_by_chrid.
+// A record containing a line with a space is discarded; an empty line ends the current record.
+// chrom_num is finally set to the number of records stored.
+// Returns false when the file cannot be opened or a record name is not in chrname_dict.
+bool WholeGenome::ReadWholeGenomeSequence(string filename){
+    std::ifstream input(filename);
+    if(!input.good()){
+        std::cerr << "Error opening '"<<filename<<"'. Bailing out." << std::endl;
+        return false;
+    }
+    std::string line, name, content;
+    int real_chrom_num = 0;
+    // Store the record accumulated in name/content. Shared by the "next header" and the
+    // end-of-file paths so both apply exactly the same checks.
+    auto store_record = [&]() -> bool {
+        auto known = chrname_dict.find(name);
+        if(known == chrname_dict.end()){
+            cout << "[VarMatch] Error: detected chromosome name: " << name <<" does not exist in human genome." << endl;
+            return false;
+        }
+        int chr_id = known->second;
+        chrid_by_chrname[name] = chr_id;
+        chrname_by_chrid[chr_id] = name;
+        genome_sequences[chr_id] = content;
+        real_chrom_num++;
+        return true;
+    };
+    // Loop on the stream returned by getline rather than on good(): when the last line has
+    // no trailing newline getline still delivers it but sets eofbit, so a good() test
+    // would silently drop the final sequence line.
+    while( std::getline( input, line ) ){
+        if( line.empty() || line[0] == '>' ){ // header line or record separator
+            if( !name.empty() ){ // flush the record read so far
+                if( !store_record() ){
+                    return false;
+                }
+                name.clear();
+            }
+            if( !line.empty() ){
+                name = split(line, ' ')[0].substr(1);
+            }
+            content.clear();
+        } else if( !name.empty() ){
+            if( line.find(' ') != std::string::npos ){ // invalid sequence: no spaces allowed
+                name.clear();
+                content.clear();
+            } else {
+                content += line;
+            }
+        }
+    }
+    if( !name.empty() ){ // flush the last record
+        if( !store_record() ){
+            return false;
+        }
+    }
+    // NOTE(review): chromosome ids come from chrname_dict (0..23), so when the genome holds
+    // only a subset of chromosomes, loops bounded by chrom_num may not reach them -- confirm.
+    chrom_num = real_chrom_num;
+    return true;
+bool WholeGenome::ReadGenomeSequenceList(string filename){
+// Read a VCF file into the per-chromosome variant vectors.
+//   filename  path of the VCF file
+//   flag      false: baseline set, stored in ref_variant_by_chrid;
+//             true:  query set, stored in que_variant_by_chrid (the raw line is kept in
+//                    query_variant_strings and the QUAL value is counted in quality_que_totalnum)
+// Returns the number of variants stored, or -1 when the file cannot be opened.
+// Records that are skipped: header/short lines, lines with fewer than 6 columns, records
+// whose genotype cannot be parsed or is 0/0, indels longer than VAR_LEN, and records on a
+// chromosome that is neither in the loaded genome nor in chrname_dict.
+// When a record has fewer than 10 columns, genotype matching is switched off
+// (match_mode_indicator = 1) and the record is still read without genotype.
+int WholeGenome::ReadWholeGenomeVariant(string filename, bool flag){
+    int total_num = 0;
+    ifstream vcf_file;
+    vcf_file.open(filename.c_str());
+    if (!vcf_file.good()) {
+        cout << "[VarMatch] Error: can not open vcf file" << endl;
+        return -1;
+    }
+    int genotype_index = -1; // position of "GT" inside the FORMAT column, found once
+    string line;
+    // Driving the loop with getline's result also terminates on a read error; a loop on
+    // eof() would spin forever if the stream failed before reaching end-of-file.
+    while (getline(vcf_file, line, '\n')) {
+        if ((int)line.length() <= 1) continue;
+        if (line[0] == '#') continue; // header line
+        auto columns = split(line, '\t');
+        // CHROM..QUAL are needed below, so this check has to come first for every record
+        if (columns.size() < 6) {
+            cout << "[VarMatch] Warning: not enough information in VCF file for variant matching." << endl;
+            cout << "[VarMatch] skip current variant: " << line << endl;
+            continue;
+        }
+        if (columns.size() < 10 && match_mode_indicator != 1) {
+            // no sample column: fall back to matching without genotype, but keep the record
+            cout << "[VarMatch] Warning: not enough information in VCF file for genotype matching." << endl;
+            cout << "[VarMatch] \tAutomatically turn off genotype matching module " << filename << endl;
+            match_mode_indicator = 1;
+        }
+        string chr_name = columns[0];
+        auto pos = atoi(columns[1].c_str()) - 1; // 0-based coordinate
+        auto ref = columns[3];
+        auto alt_line = columns[4];
+        // QUAL may be "." (missing value): strtof yields 0 for it, whereas stof would throw
+        float quality = strtof(columns[5].c_str(), NULL);
+        if(flag){
+            int quality_int = (int) quality;
+            if(quality_que_totalnum.find(quality_int) != quality_que_totalnum.end()){
+                quality_que_totalnum[quality_int] += 1.0;
+            }else{
+                quality_que_totalnum[quality_int] = 1.0;
+            }
+        }
+        ToUpper(ref);
+        ToUpper(alt_line);
+        bool is_heterozygous_variant = false;
+        bool is_multi_alternatives = false;
+        if (columns.size() >= 10) {
+            if (genotype_index < 0) {
+                auto formats = split(columns[8], ':');
+                for (size_t i = 0; i < formats.size(); i++) {
+                    if (formats[i] == "GT") {
+                        genotype_index = (int)i;
+                        break;
+                    }
+                }
+                if(genotype_index < 0){
+                    cout << "[VarMatch] VCF entry does not contain genotype information" << endl;
+                    continue;
+                }
+            }
+            auto additionals = split(columns[9], ':');
+            if (genotype_index >= (int)additionals.size()) {
+                // the sample column has fewer fields than the FORMAT seen earlier
+                cout << "[VarMatch] Warning Unrecognized Genotype: " << columns[9] << endl;
+                continue;
+            }
+            // Accept unphased ('/') and phased ('|') genotypes on every record; a file may
+            // mix both, so the separator must not be remembered from one record to the next.
+            vector<string> genotype_columns = split(additionals[genotype_index], '/');
+            if(genotype_columns.size() != 2){
+                genotype_columns = split(additionals[genotype_index], '|');
+            }
+            if (genotype_columns.size() != 2) {
+                cout << "[VarMatch] Warning Unrecognized Genotype: " << additionals[genotype_index] << endl;
+                continue;
+            }
+            if (genotype_columns[0] != genotype_columns[1]) {
+                is_heterozygous_variant = true;
+            }
+            if (genotype_columns[1] == "0" && genotype_columns[0] == "0") {
+                continue; // homozygous reference: not a variant
+            }
+        }
+        vector<string> alt_list;
+        if (alt_line.find(",") != std::string::npos) {
+            alt_list = split(alt_line, ',');
+        }
+        else {
+            alt_list.push_back(alt_line);
+        }
+        if (alt_list.empty()) continue; // malformed ALT column
+        // alt_list[1] is read below, so "multiple alternatives" requires two parsed alleles
+        is_multi_alternatives = alt_list.size() > 1;
+        int snp_ins = max(0, (int)alt_list[0].length() - (int)ref.length());
+        int snp_del = max(0, (int)ref.length() - (int)alt_list[0].length());
+        if(is_multi_alternatives){
+            snp_ins = max(snp_ins, (int)alt_list[1].length() - (int)ref.length());
+            snp_del = max(snp_del, (int)ref.length() - (int)alt_list[1].length());
+        }
+        if(snp_ins > VAR_LEN || snp_del > VAR_LEN){
+            continue; // skip indels longer than VAR_LEN
+        }
+        DiploidVariant dv(pos, ref, alt_list, is_heterozygous_variant, is_multi_alternatives, snp_del, snp_ins, flag, quality);
+        // Resolve the chromosome id: prefer the chromosomes of the loaded genome, then the
+        // name dictionary. find() is used because operator[] would insert id 0 for an
+        // unknown contig and file its variants under the first chromosome.
+        int chr_id = -1;
+        auto loaded = chrid_by_chrname.find(chr_name);
+        if(loaded != chrid_by_chrname.end()){
+            chr_id = loaded->second;
+        }else{
+            auto known = chrname_dict.find(chr_name);
+            if(known == chrname_dict.end()){
+                continue; // unknown chromosome / contig
+            }
+            chr_id = known->second;
+        }
+        if(flag == false){
+            ref_variant_by_chrid[chr_id]->push_back(dv);
+        }else{
+            que_variant_by_chrid[chr_id]->push_back(dv);
+            query_variant_strings.push_back(line);
+        }
+        total_num++;
+    }
+    vcf_file.close();
+    return total_num;
+bool WholeGenome::ReadVariantFileList(string filename){
+int WholeGenome::ScoreEditDistance(DiploidVariant & dv, int allele_indicator){ // Edit distance between dv.ref and the alternate allele dv.alts[allele_indicator].
+    return EditDistance(dv.ref, dv.alts[allele_indicator]); // allele_indicator is used as-is to index dv.alts; the caller must pass a valid index
+inline int WholeGenome::EditDistance(const std::string& s1, const std::string& s2)
+	const std::size_t len1 = s1.size(), len2 = s2.size();
+	std::vector<unsigned int> col(len2+1), prevCol(len2+1);
+	for (unsigned int i = 0; i < prevCol.size(); i++)
+		prevCol[i] = i;
+	for (unsigned int i = 0; i < len1; i++) {
+		col[0] = i+1;
+		for (unsigned int j = 0; j < len2; j++)
+                        // note that std::min({arg1, arg2, arg3}) works only in C++11,
+                        // for C++98 use std::min(std::min(arg1, arg2), arg3)
+			col[j+1] = std::min({ prevCol[1 + j] + 1, col[j] + 1, prevCol[j] + (s1[i]==s2[j] ? 0 : 1) });
+		col.swap(prevCol);
+	}
+	return prevCol[len2];
+// Cluster the variants of every chromosome, processing up to thread_num chromosomes at a
+// time (thread_num-1 worker threads plus the calling thread), then append the clusters of
+// all chromosomes to variants_by_cluster in chromosome order.
+// Chromosome ids missing from chrname_by_chrid are skipped.
+// Always returns true.
+bool WholeGenome::ParallelClustering(){
+    variant_cluster_by_chrid = new vector<vector<VariantIndicator>> *[chrom_num];
+    for (int j = 0; j < chrom_num; j++) {
+        variant_cluster_by_chrid[j] = new vector<vector<VariantIndicator>>;
+    }
+    // a non-positive thread_num would divide by zero below: fall back to a single worker
+    const int worker_num = max(thread_num, 1);
+    const int parallel_steps = (chrom_num + worker_num - 1) / worker_num; // ceiling division
+    int chr_id = 0;
+    for(int i = 0; i < parallel_steps; i++){
+        vector<thread> threads;
+        // hand chromosomes to worker threads, always leaving one for the calling thread
+        for(int j = 0; j < worker_num-1 && chr_id < chrom_num-1; j++){
+            if(chrname_by_chrid.find(chr_id) != chrname_by_chrid.end()){
+                threads.push_back(thread(&WholeGenome::SingleThreadClustering, this, chr_id));
+            }
+            chr_id ++;
+        }
+        if(chr_id < chrom_num){
+            if(chrname_by_chrid.find(chr_id) != chrname_by_chrid.end()){
+                SingleThreadClustering(chr_id);
+            }
+            chr_id ++;
+        }
+        // wait for this batch before starting the next one
+        for(auto & worker : threads){
+            worker.join();
+        }
+    }
+    for(int i = 0; i < chrom_num; i++){
+        variants_by_cluster.insert(variants_by_cluster.end(), variant_cluster_by_chrid[i]->begin(), variant_cluster_by_chrid[i]->end());
+    }
+    // the per-chromosome lists are only needed while clustering: release them and reset
+    // the member so that no dangling pointer is left behind
+    for(int j = 0; j < chrom_num; j++){
+        delete variant_cluster_by_chrid[j];
+    }
+    delete[] variant_cluster_by_chrid;
+    variant_cluster_by_chrid = nullptr;
+    return true;
+bool WholeGenome::ParallelMatching(){
+bool WholeGenome::TBBMatching()
+// Report whether 'sequence' (compared case-insensitively) is a tandem repeat: true when
+// some prefix of length L, 1 <= L <= length/2, is followed only by further copies of
+// itself; an incomplete copy at the very end is ignored.
+// A one-character sequence is reported as a repeat; an empty sequence is not.
+//   sequence        sequence to test (taken by value, upper-cased internally)
+//   unit_threshold  kept for interface compatibility; it does not influence the result
+bool WholeGenome::CheckTandemRepeat(string sequence, int unit_threshold) {
+    (void)unit_threshold;
+    const int sequence_length = (int)sequence.length();
+    if(sequence_length == 1) return true;
+    transform(sequence.begin(), sequence.end(), sequence.begin(), ::toupper);
+    // a unit longer than half of the sequence cannot occur twice, so stop at length/2
+    for (int unit_length = 1; unit_length <= sequence_length / 2; unit_length++) {
+        bool all_units_match = true;
+        for (int start = unit_length; start + unit_length <= sequence_length; start += unit_length) {
+            // compare against the leading unit in place, without building substrings
+            if (sequence.compare(start, unit_length, sequence, 0, unit_length) != 0) {
+                all_units_match = false;
+                break;
+            }
+        }
+        if (all_units_match) {
+            return true;
+        }
+    }
+    return false;
+bool WholeGenome::MatchVariantListInThread(int thread_index,
+    int chr_id,
+    vector<DiploidVariant> & variant_list,
+    int cluster_id){
+    //===================================================
+    sort(variant_list.begin(), variant_list.end());
+    // decide reference sequence
+    vector<DiploidVariant> separate_var_list[2];
+    vector<Interval> intervals;
+    // separate into ref and que
+    int total_mil = 0;
+    int total_mdl = 0;
+    int min_pos = genome_sequences[chr_id].length() + 1;
+    int max_pos = -1;
+    for (int i = 0; i < variant_list.size(); i++) {
+        int flag = 0;
+        if (variant_list[i].flag) flag = 1; // flag indicate if the variant is from ref set(0) or query set(1)
+        int pos = variant_list[i].pos;
+        separate_var_list[flag].push_back(variant_list[i]);
+        total_mil += variant_list[i].mil;
+        total_mdl += variant_list[i].mdl;
+        auto ref_sequence = variant_list[i].ref;
+        auto alt_sequences = variant_list[i].alts;
+        min_pos = min(pos, min_pos);
+        max_pos = max((int)(pos + ref_sequence.length()), max_pos);
+        int end_pos = pos + ref_sequence.length() - 1; // included end position!!
+        intervals.push_back(Interval(pos, end_pos));
+    }
+    min_pos = max(min_pos - 1, 0);
+    max_pos = min(max_pos + 1, (int)genome_sequences[chr_id].length()); //exclusive
+    if (separate_var_list[0].size() == 0 || separate_var_list[1].size() == 0) {
+        //dout << separate_var_list[0].size() << ", " << separate_var_list[1].size() << endl;
+        return false;
+    }
+    if (separate_var_list[0].size() == 1 && separate_var_list[1].size() == 1){
+        // try direct match to save time
+        if(separate_var_list[0][0] == separate_var_list[1][0]){
+            DiploidVariant tv = separate_var_list[0][0];
+            string match_record = to_string(tv.pos+1) + "\t" + tv.ref + "\t" + tv.alts[0];
+            if(tv.multi_alts) match_record += "/" + tv.alts[1];
+            match_record += "\t.\t.\t.\t.\t.\n";
+            // here we need to push back for all mode_index
+            //complex_match_records[thread_index]->push_back(match_record);
+            int qual = (int)(tv.qual);
+            for(int mi = 0; mi < mode_index_list.size(); mi ++){
+                int mode_i = mode_index_list[mi];
+                if(mi == 0){
+                    match_records_by_mode_by_thread[thread_index][mode_i]->push_back(match_record);
+                }else{
+                    match_records_by_mode_by_thread[thread_index][mode_i]->push_back("$");
+                    // use dollor to represent that it is the same
+                }
+                baseline_total_match_num[thread_index]->at(mode_i)++;
+                query_total_match_num[thread_index]->at(mode_i)++;
+                auto end_it = quality_que_matchnum_by_thread_mode[thread_index][mode_i]->end();
+                if(quality_que_matchnum_by_thread_mode[thread_index][mode_i]->find(qual) != end_it){
+                    quality_que_matchnum_by_thread_mode[thread_index][mode_i]->at(qual) += 1.0;
+                }else{
+                    quality_que_matchnum_by_thread_mode[thread_index][mode_i]->at(qual) = 1.0;
+                }
+            }
+            // output match result
+            return true;
+        }
+        // if not match, still can match by changing genome
+    }else if(separate_var_list[0].size() == 1 || separate_var_list[1].size() == 1){
+        int flag = 0;
+        if(separate_var_list[1].size() == 1) flag = 1;
+        int r_flag = 1-flag;
+        if(separate_var_list[r_flag].size() > 4){
+            int total_r_mdl = 0;
+            int total_r_mil = 0;
+            for(int k = 0; k < separate_var_list[r_flag].size(); k++){
+                DiploidVariant var = separate_var_list[r_flag][k];
+                int var_mdl = var.mdl;
+                int var_mil = var.mil;
+                int ref_length = var.ref.length();
+                total_r_mdl += var_mdl;
+                total_r_mil += var_mil;
+            }
+            if(max(separate_var_list[flag][0].mdl, separate_var_list[flag][0].mil) > max(total_r_mdl, total_r_mil)) return false;
+        }
+    }
+    // remove singular variant
+    // [todo] try removing this filter to see running time changes
+    vector<bool> appliable_flag;
+    int total_change = total_mil+total_mdl;
+    for(int k = 0; k < variant_list.size(); k++){
+        DiploidVariant cur_var = variant_list[k];
+        int max_change = max(cur_var.mil, cur_var.mdl);
+        if(max_change > total_change-max_change){
+            appliable_flag.push_back(false);
+            //dout << "this variant is removed" << endl;
+        }else{
+            appliable_flag.push_back(true);
+        }
+    }
+    string subsequence = genome_sequences[chr_id].substr(min_pos, max_pos - min_pos);
+    ToUpper(subsequence); // subsequence only contains upper char
+    int offset = min_pos;
+    int subsequence_length = max_pos - min_pos;
+    // have subsequence in hand
+    //generate decision point
+    multimap<int, int> * choices_by_pos[2];
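+    // choices_by_pos maps an offset-relative position to the indices of the variants starting there, so it also serves as a variant-by-position lookup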
+    for(int i = 0; i < 2; i++){
+        choices_by_pos[i] = new multimap<int, int>();
+    }
+    for(int index = 0; index < variant_list.size(); index++){
+        if(!appliable_flag[index]) continue;
+        // remove decision point if not applicable
+        int pos = variant_list[index].pos - offset;
+        int flag = 0;
+        if(variant_list[index].flag) flag = 1;
+        choices_by_pos[flag]->insert(pair<int, int>(pos, index));
+        //dout << pos << index << endl;
+    }
+    vector<Interval> mergered_intervals = merge(intervals);
+//    unordered_map<int, bool> sync_points;
+//    for(int i = 0; i < mergered_intervals.size(); i++){
+//        sync_points[mergered_intervals[i].end-offset] = true;
+//    }
+    vector<int> sync_points;
+    for(int i = 0; i < mergered_intervals.size(); i++){
+        sync_points.push_back(mergered_intervals[i].end-offset);
+    }
+    if(sync_points.back() < subsequence.size() - 1){
+        sync_points.push_back(subsequence.size()-1);
+    }
+    int score_unit;
+    int match_mode;
+    int score_scheme;
+    for(int i = 0; i < score_unit_list.size(); i++){
+        score_unit = score_unit_list[i];
+        for(int j = 0; j < match_mode_list.size(); j++){
+            match_mode = match_mode_list[j];
+            for(int k = 0; k < score_scheme_list.size(); k++){
+                score_scheme = score_scheme_list[k];
+                bool method2 = MatchingSingleClusterBaseExtending(
+                                            cluster_id,
+                                            thread_index,
+                                            variant_list,
+                                            subsequence,
+                                            offset,
+                                            choices_by_pos,
+                                            sync_points,
+                                            chr_id,
+                                            score_unit,
+                                            match_mode,
+                                            score_scheme);
+            }
+        }
+    }
+    return true;
+// Match every cluster whose id lies in [start, end) on behalf of worker `thread_index`.
+// Each VariantIndicator of a cluster is resolved to its DiploidVariant record
+// (taken from ref_variant_by_chrid when vi.refer is set, from que_variant_by_chrid
+// otherwise) and the resulting list is handed to MatchVariantListInThread.
+// Clusters holding fewer than two variants are skipped. Always returns true.
+bool WholeGenome::ClusteringMatchInThread(int start, int end, int thread_index) {
+    for (int cluster_id = start; cluster_id < end; cluster_id++) {
+        // a negative id used to wrap around in the unsigned comparison and end the
+        // loop as well; the behaviour is kept, it is just spelled out now
+        if (cluster_id < 0 || (size_t)cluster_id >= variants_by_cluster.size()) break;
+        // bind by reference: the indicator list is only read, no need to copy it
+        const vector<VariantIndicator> & vi_list = variants_by_cluster[cluster_id];
+        if (vi_list.size() <= 1) continue;
+        // create variant_list from vi_list
+        vector<DiploidVariant> variant_list;
+        variant_list.reserve(vi_list.size());
+        int chr_id = -1;
+        bool valid_cluster = true;
+        for (size_t i = 0; i < vi_list.size(); i++) {
+            const VariantIndicator & vi = vi_list[i];
+            chr_id = vi.chr_id;
+            // validate BEFORE indexing the per-chromosome tables; the old code
+            // checked the range only after the lookup had already happened
+            if (chr_id < 0 || chr_id >= chrom_num) {
+                valid_cluster = false;
+                break;
+            }
+            if (vi.refer) {
+                variant_list.push_back(ref_variant_by_chrid[chr_id]->at(vi.var_id));
+            } else {
+                variant_list.push_back(que_variant_by_chrid[chr_id]->at(vi.var_id));
+            }
+        }
+        if (!valid_cluster) {
+            cout << "[VarMatch] Error in matching single cluster" << endl;
+            continue;
+        }
+        MatchVariantListInThread(thread_index,
+                                chr_id,
+                                variant_list,
+                                cluster_id);
+    }
+    return true;
+}
+// to reduce memory usage of paths, every function that works on a SequencePath lives in WholeGenome and takes the path as a parameter
+// Returns true when, on at least one of the two sides of choices_by_pos, a
+// variant is registered at `pos` and `sp` has not recorded a choice for it yet.
+bool WholeGenome::PathNeedDecision(SequencePath& sp, multimap<int, int> * choices_by_pos[], int pos){
+    for(int side = 0; side < 2; side++){
+        // no variant starts here on this side: nothing to decide
+        if(choices_by_pos[side]->count(pos) == 0) continue;
+        // a variant starts here and no choice has been made at this position
+        if(sp.choice_made[side].count(pos) == 0) return true;
+    }
+    return false;
+}
+// Check that the donor sequences built so far on side 0 (indices 0,1) and on
+// side 1 (indices 2,3) are still compatible.
+//   match_mode == 0 : both haplotypes are compared (0 vs 2 and 1 vs 3)
+//   otherwise       : only the first haplotype is compared (0 vs 2)
+// Returns 0 when the path is still consistent, -1 when it must be dropped.
+// Side effects on sp: same_donor_len and current_equal_donor_pos are updated.
+int WholeGenome::CheckPathEqualProperty(SequencePath & sp, int match_mode)
+{
+    const int strand_num = (match_mode == 0) ? 2 : 1;
+    bool same_length = true;
+    for(int s = 0; s < strand_num; s++){
+        if(sp.donor_sequences[s].length() != sp.donor_sequences[2+s].length()){
+            same_length = false;
+            break;
+        }
+    }
+    if(same_length){
+        // same donor length: the sequences have to be identical, otherwise the path dies
+        for(int s = 0; s < strand_num; s++){
+            if(sp.donor_sequences[s] != sp.donor_sequences[2+s]) return -1;
+        }
+        sp.same_donor_len = true;
+        for(int s = 0; s < strand_num; s++){
+            sp.current_equal_donor_pos[s] = sp.donor_sequences[s].length()-1;
+        }
+        return 0;
+    }
+    // different lengths: only the common prefix can be verified; resume right
+    // after the last position that was already confirmed equal
+    sp.same_donor_len = false;
+    for(int s = 0; s < strand_num; s++){
+        const string & lhs = sp.donor_sequences[s];
+        const string & rhs = sp.donor_sequences[2+s];
+        const int common_len = (int)min(lhs.length(), rhs.length());
+        for(int k = sp.current_equal_donor_pos[s]+1; k < common_len; k++){
+            if(lhs[k] != rhs[k]) return -1;
+        }
+        sp.current_equal_donor_pos[s] = common_len-1;
+    }
+    return 0;
+}
+// one step is not one nt, but to the next sync point
+// i.e. one step, one sync point
+// Return codes:
+//   -1  operation failed, the path has to be deleted
+//    0  operation succeeded, the next sync point was reached
+//    1  stopped in front of a position that needs a decision first
+//    2  the last sync point was passed and the donor sequences are equal
+int WholeGenome::PathExtendOneStep(SequencePath& sp,
+                                   multimap<int, int> * choices_by_pos[],
+                                   const string & reference_sequence,
+                                   vector<int> & sync_points,
+                                   int match_mode){
+    // explicit form of the original signed/unsigned comparison (same outcome)
+    if(sp.reached_sync_num < 0 || (size_t)sp.reached_sync_num >= sync_points.size()) return -1;
+    const int first_pos = sp.current_genome_pos + 1;
+    const int last_pos = sync_points[sp.reached_sync_num]; // the next sync point, included
+    // match_mode 1 maintains only strands 0 and 2
+    const int strand_step = (match_mode == 1) ? 2 : 1;
+    for(int genome_pos = first_pos; genome_pos <= last_pos; genome_pos++){
+        // before a decision is made, the equal property must still hold
+        if(PathNeedDecision(sp, choices_by_pos, genome_pos)){
+            if(CheckPathEqualProperty(sp, match_mode) == -1) return -1;
+            return 1; // need decision on next position
+        }
+        // no decision pending: extend every maintained strand by this position
+        for(int strand = 0; strand < 4; strand += strand_step){
+            const string & planned = sp.string_sequences[strand][genome_pos];
+            if(planned == "."){
+                // "." means untouched by any variant, so copy the reference base
+                sp.donor_sequences[strand] += reference_sequence[genome_pos];
+            }else{
+                sp.donor_sequences[strand] += planned;
+            }
+        }
+        sp.current_genome_pos = genome_pos;
+    }
+    // the sync point has been reached
+    sp.reached_sync_num ++;
+    if((size_t)sp.reached_sync_num >= sync_points.size()){
+        // last sync point is the end of the reference subsequence
+        const bool donors_equal = sp.donor_sequences[0] == sp.donor_sequences[2] &&
+                                  sp.donor_sequences[1] == sp.donor_sequences[3];
+        return donors_equal ? 2 : -1;
+    }
+    return CheckPathEqualProperty(sp, match_mode);
+}
+    // first try to converge, then extend
+// Score contributed by applying variant `dv` with phasing `choice`.
+//   score_unit 0   : every applied variant counts 1
+//   score_unit 1   : edit distance via ScoreEditDistance; with match_mode 0 the
+//                    first allele always counts and the second one counts when
+//                    choice is 0 and dv.multi_alts is set, or when choice is
+//                    neither 0 nor -1; with any other match_mode only allele
+//                    `choice` counts
+//   score_scheme 0 : the score is returned for every variant
+//   score_scheme 1 : only variants with dv.flag == false score, others give 0
+//   score_scheme 2 : only variants with dv.flag == true score, others give 0
+// Any other score_scheme is treated like scheme 0; the function used to fall
+// off its end without a return value in that case (undefined behaviour).
+int WholeGenome::CalculateScore(DiploidVariant & dv,
+                                int choice,
+                                int score_unit,
+                                int match_mode,
+                                int score_scheme){
+    int score = 0;
+    if(score_unit == 0){
+        score = 1;
+    }else if(score_unit == 1){
+        if(match_mode == 0){
+            score += ScoreEditDistance(dv, 0);
+            const bool count_second_allele = (choice == 0) ? (bool)dv.multi_alts : (choice != -1);
+            if(count_second_allele){
+                score += ScoreEditDistance(dv, 1);
+            }
+        }else{
+            score += ScoreEditDistance(dv, choice);
+        }
+    }
+    if(score_scheme == 1){
+        return dv.flag ? 0 : score;
+    }
+    if(score_scheme == 2){
+        return dv.flag ? score : 0;
+    }
+    return score;
+}
+// no genotype means you can maintain only one strand
+// for simplicity, also work on original SequencePath data structure
+// when making decision, only decide one path
+// when extending, only extend one path
+// when comparing, only compare one path
+//
+// Branch `sp` at the position right after sp.current_genome_pos. For each of the
+// two sides of choices_by_pos the candidates are "use no variant" (-1,-1) plus
+// every (variant index, allele index) whose allele can still be laid over the
+// first strand of that side (strand 0 for side 0, strand 2 for side 1). One copy
+// of `sp` per (side-0 candidate, side-1 candidate) combination is appended to
+// sequence_path_list with the chosen alleles written into string_sequences, the
+// score increased through CalculateScore and the choice stored in choice_made.
+// reference_sequence is not used here. Always returns true.
+bool WholeGenome::PathMakeDecisionNoGenotype(SequencePath& sp,
+                                 vector<DiploidVariant> & variant_list,
+                                 multimap<int, int> * choices_by_pos[],
+                                 list<SequencePath> & sequence_path_list,
+                                 const string & reference_sequence,
+                                 int score_unit,
+                                 int match_mode,
+                                 int score_scheme)
+{
+    const int pos = sp.current_genome_pos+1;
+    // true when `alt` can still replace `ref` on `strand`: wherever an earlier
+    // decision already wrote something other than ".", alt must still have a
+    // base there (no deletion) and that base must equal the reference base
+    auto allele_fits = [&sp, pos](int strand, const string & ref, const string & alt) -> bool {
+        const int ref_len = (int)ref.length();
+        const int alt_len = (int)alt.length();
+        for(int k = 0; k < ref_len; k++){
+            if(sp.string_sequences[strand][k+pos] == ".") continue;
+            if(k >= alt_len || ref[k] != alt[k]) return false;
+        }
+        return true;
+    };
+    // write `alt` (replacing `ref`, both already upper case) into `strand` of `path`
+    auto apply_allele = [pos](SequencePath & path, int strand, const string & ref, const string & alt) -> void {
+        // nothing to overlay; this also keeps ref.length()-1 from wrapping around
+        if(ref.empty()) return;
+        const int last = (int)ref.length()-1;
+        const int alt_len = (int)alt.length();
+        for(int k = 0; k < last; k++){
+            if(k >= alt_len){
+                // base deleted by the alt allele
+                path.string_sequences[strand][pos+k] = "";
+            }else if(ref[k] != alt[k]){
+                path.string_sequences[strand][pos+k] = alt.substr(k,1);
+            }
+            // else change nothing
+        }
+        // the last reference position carries whatever is left of the alt allele
+        string & tail_slot = path.string_sequences[strand][pos+last];
+        if(last >= alt_len){
+            tail_slot = "";
+            return;
+        }
+        const string alt_tail = alt.substr(last);
+        if(alt_tail.length() > 1){
+            if(alt_tail[0] == ref[last] && tail_slot != "."){
+                // the slot was already edited: append only the inserted bases
+                tail_slot += alt_tail.substr(1);
+            }else{
+                tail_slot = alt_tail;
+            }
+        }else if(ref[last] != alt[last]){
+            tail_slot = alt_tail;
+        }
+    };
+    vector<pair<int, int>> candidate_choices[2];
+    for(int side = 0; side < 2; side++){
+        // (-1,-1) means "apply no variant here"; it changes nothing, so it is always allowed
+        candidate_choices[side].push_back(pair<int, int>(-1, -1));
+        auto var_range = choices_by_pos[side]->equal_range(pos);
+        for(auto it = var_range.first; it != var_range.second; ++it){
+            const int var_index = it->second;
+            const DiploidVariant & var = variant_list[var_index];
+            if(allele_fits(side*2, var.ref, var.alts[0])){
+                candidate_choices[side].push_back(pair<int, int>(var_index, 0));
+            }
+            // a second alt allele is a separate choice
+            if(var.multi_alts && allele_fits(side*2, var.ref, var.alts[1])){
+                candidate_choices[side].push_back(pair<int, int>(var_index, 1));
+            }
+        }
+    }
+    for(size_t a = 0; a < candidate_choices[0].size(); a++){
+        for(size_t b = 0; b < candidate_choices[1].size(); b++){
+            // iterate all choices
+            SequencePath path = sp;
+            const pair<int, int> picked[2] = { candidate_choices[0][a], candidate_choices[1][b] };
+            for(int side = 0; side < 2; side++){
+                const int var_index = picked[side].first;
+                if(var_index != -1){
+                    // copy: CalculateScore takes a mutable reference
+                    DiploidVariant var = variant_list[var_index];
+                    const int c = picked[side].second;
+                    string ref = var.ref;
+                    string alt = var.alts[c];
+                    path.score += CalculateScore(var,
+                                                 c,
+                                                 score_unit,
+                                                 match_mode,
+                                                 score_scheme);
+                    ToUpper(ref);
+                    ToUpper(alt);
+                    apply_allele(path, side*2, ref, alt);
+                }
+                path.choice_made[side][pos] = picked[side];
+            }
+            sequence_path_list.push_back(path);
+        }
+    }
+    //expected number of inserted paths are 2,3,4,6,x...
+    return true;
+}
+// Branch `sp` at the position right after sp.current_genome_pos, keeping genotype.
+// For each of the two sides of choices_by_pos the candidates are "use no variant"
+// (-1,-1) plus, for every variant starting here, each phasing that can still be
+// laid over both strands of that side:
+//   (var, 0)  alts[0] on the first strand; the second strand gets alts[1] for a
+//             multi-alt variant, ref for a heterozygous one, alts[0] otherwise
+//   (var, 1)  multi-alt variant with the two alts swapped
+//   (var,-1)  heterozygous single-alt variant with ref on the first strand
+// One copy of `sp` per (side-0 candidate, side-1 candidate) combination is
+// appended to sequence_path_list with the alleles written into string_sequences,
+// the score increased through CalculateScore and the choice stored in
+// choice_made. reference_sequence is not used here. Always returns true.
+bool WholeGenome::PathMakeDecision(SequencePath& sp,
+                                 vector<DiploidVariant> & variant_list,
+                                 multimap<int, int> * choices_by_pos[],
+                                 list<SequencePath> & sequence_path_list,
+                                 const string & reference_sequence,
+                                 int score_unit,
+                                 int match_mode,
+                                 int score_scheme)
+{
+    const int pos = sp.current_genome_pos+1;
+    // true when `alt` can still replace `ref` on `strand`: wherever an earlier
+    // decision already wrote something other than ".", alt must still have a
+    // base there (no deletion) and that base must equal the reference base
+    auto allele_fits = [&sp, pos](int strand, const string & ref, const string & alt) -> bool {
+        const int ref_len = (int)ref.length();
+        const int alt_len = (int)alt.length();
+        for(int k = 0; k < ref_len; k++){
+            if(sp.string_sequences[strand][k+pos] == ".") continue;
+            if(k >= alt_len || ref[k] != alt[k]) return false;
+        }
+        return true;
+    };
+    // write `alt` (replacing `ref`, both already upper case) into `strand` of `path`
+    auto apply_allele = [pos](SequencePath & path, int strand, const string & ref, const string & alt) -> void {
+        // nothing to overlay; this also keeps ref.length()-1 from wrapping around
+        if(ref.empty()) return;
+        const int last = (int)ref.length()-1;
+        const int alt_len = (int)alt.length();
+        for(int k = 0; k < last; k++){
+            if(k >= alt_len){
+                // base deleted by the alt allele
+                path.string_sequences[strand][pos+k] = "";
+            }else if(ref[k] != alt[k]){
+                path.string_sequences[strand][pos+k] = alt.substr(k,1);
+            }
+            // else change nothing
+        }
+        // the last reference position carries whatever is left of the alt allele
+        string & tail_slot = path.string_sequences[strand][pos+last];
+        if(last >= alt_len){
+            tail_slot = "";
+            return;
+        }
+        const string alt_tail = alt.substr(last);
+        if(alt_tail.length() > 1){
+            if(alt_tail[0] == ref[last] && tail_slot != "."){
+                // the slot was already edited: append only the inserted bases
+                tail_slot += alt_tail.substr(1);
+            }else{
+                tail_slot = alt_tail;
+            }
+        }else if(ref[last] != alt[last]){
+            tail_slot = alt_tail;
+        }
+    };
+    vector<pair<int, int>> candidate_choices[2];
+    for(int side = 0; side < 2; side++){
+        // (-1,-1) means "apply no variant here"; it changes nothing, so it is always allowed
+        candidate_choices[side].push_back(pair<int, int>(-1, -1));
+        auto var_range = choices_by_pos[side]->equal_range(pos);
+        for(auto it = var_range.first; it != var_range.second; ++it){
+            const int var_index = it->second;
+            const DiploidVariant & var = variant_list[var_index];
+            const string & ref = var.ref;
+            const string first = var.alts[0];
+            string second = first;
+            if(var.multi_alts){
+                second = var.alts[1];
+            }else if(var.heterozygous){
+                second = ref;
+            }
+            // phasing as listed: `first` on the first strand, `second` on the second
+            if(allele_fits(side*2, ref, first) && allele_fits(side*2+1, ref, second)){
+                candidate_choices[side].push_back(pair<int, int>(var_index, 0));
+            }
+            // if heterozygous, the swapped phasing is another choice
+            if(var.heterozygous &&
+               allele_fits(side*2, ref, second) && allele_fits(side*2+1, ref, first)){
+                candidate_choices[side].push_back(pair<int, int>(var_index, var.multi_alts ? 1 : -1));
+            }
+        }
+    }
+    for(size_t a = 0; a < candidate_choices[0].size(); a++){
+        for(size_t b = 0; b < candidate_choices[1].size(); b++){
+            // iterate all choices
+            SequencePath path = sp;
+            const pair<int, int> picked[2] = { candidate_choices[0][a], candidate_choices[1][b] };
+            for(int side = 0; side < 2; side++){
+                const int var_index = picked[side].first;
+                if(var_index != -1){
+                    // copy: CalculateScore takes a mutable reference
+                    DiploidVariant var = variant_list[var_index];
+                    const int c = picked[side].second;
+                    string ref = var.ref;
+                    string alleles[2];
+                    if(c == -1){
+                        // swapped heterozygous phasing: ref first, alt second
+                        alleles[0] = ref;
+                        alleles[1] = var.alts[0];
+                    }else{
+                        // c == 0 or 1
+                        alleles[0] = var.alts[c];
+                        if(var.multi_alts){
+                            alleles[1] = var.alts[1-c];
+                        }else if(var.heterozygous){
+                            alleles[1] = ref;
+                        }else{
+                            alleles[1] = alleles[0];
+                        }
+                    }
+                    path.score += CalculateScore(var,
+                                                 c,
+                                                 score_unit,
+                                                 match_mode,
+                                                 score_scheme);
+                    ToUpper(ref);
+                    ToUpper(alleles[0]);
+                    ToUpper(alleles[1]);
+                    for(int y = 0; y < 2; y++){
+                        apply_allele(path, side*2+y, ref, alleles[y]);
+                    }
+                }
+                path.choice_made[side][pos] = picked[side];
+            }
+            sequence_path_list.push_back(path);
+        }
+    }
+    //expected number of inserted paths are 2,3,4,6,x...
+    return true;
+}
+// Debug helper: print the four per-position string sequences, the four donor
+// sequences and the removable flag of `sp` to stdout.
+void WholeGenome::PrintPath(SequencePath & sp){
+    cout << "- Sequence Path:" << endl;
+    cout << "@ String Sequences:" << endl;
+    for(int strand = 0; strand < 4; strand++){
+        // one line per strand, entries separated by a blank
+        for(size_t j = 0; j < sp.string_sequences[strand].size(); j++){
+            cout << sp.string_sequences[strand][j] << " ";
+        }
+        cout << endl;
+    }
+    cout << "@ Donor Sequences:" << endl;
+    for(int strand = 0; strand < 4; strand++){
+        cout << sp.donor_sequences[strand] << endl;
+    }
+    cout << "@ Removable: " << sp.removable << endl;
+}
+// Searches one variant cluster for the best-scoring path, round by round.
+// Every pending path is advanced with PathExtendOneStep and handled
+// according to the returned code:
+//   -1  the path is dropped
+//    0  the path is parked until all pending paths have been processed
+//    1  a decision is needed; the decision routine receives the pending
+//       list (presumably to queue the follow-up paths - TODO confirm)
+//    2  the path reached the very end and competes for the best score
+// When a round is over, the parked paths become the pending ones and are
+// passed through ConvergePaths.
+//
+// Returns false when no finished path has a positive score. Otherwise the
+// match record for the best path is constructed (genotype-aware variant
+// when match_mode == 0, no-genotype variant otherwise) and true is returned.
+bool WholeGenome::MatchingSingleClusterBaseExtending(int cluster_index,
+                                                    int thread_index,
+                                                    vector<DiploidVariant> & variant_list,
+                                                    string & subsequence,
+                                                    int offset,
+                                                    multimap<int, int> * choices_by_pos[],
+                                                    vector<int> & sync_points,
+                                                    int chr_id,
+                                                    int score_unit,
+                                                    int match_mode,
+                                                    int score_scheme){
+    // A legal sync_points vector contains at least two entries:
+    // the first is the end of a variant (there should be at least one variant),
+    // the second is the end of subsequence (at least one nt is not influenced by a variant).
+    const bool with_genotype = (match_mode == 0);
+    list<SequencePath> pending_paths;
+    list<SequencePath> parked_paths;
+    SequencePath seed_path(subsequence.length());
+    SequencePath best_path = seed_path;
+    pending_paths.push_back(seed_path);
+    while(!pending_paths.empty()){
+        while(!pending_paths.empty()){
+            SequencePath path = pending_paths.front();
+            pending_paths.pop_front();
+            int step_result = PathExtendOneStep(path, choices_by_pos, subsequence, sync_points, match_mode);
+            switch(step_result){
+            case 0:
+                // the path is supposed to have reached the next sync point
+                parked_paths.push_back(path);
+                break;
+            case 1:
+                if(with_genotype){
+                    PathMakeDecision(path,
+                                     variant_list,
+                                     choices_by_pos,
+                                     pending_paths,
+                                     subsequence,
+                                     score_unit,
+                                     match_mode,
+                                     score_scheme);
+                }else{
+                    PathMakeDecisionNoGenotype(path,
+                                               variant_list,
+                                               choices_by_pos,
+                                               pending_paths,
+                                               subsequence,
+                                               score_unit,
+                                               match_mode,
+                                               score_scheme);
+                }
+                break;
+            case 2:
+                // only a path that reached the very end can become the best path
+                if(path.score > best_path.score){
+                    best_path = path;
+                }
+                break;
+            default:
+                // -1 (or any other code): nothing is kept from this path
+                break;
+            }
+        }
+        // pending_paths is empty here, so swapping hands over the parked paths
+        pending_paths.swap(parked_paths);
+        parked_paths.clear();
+        if(!pending_paths.empty()){
+            ConvergePaths(pending_paths);
+        }
+    }
+    if(best_path.score <= 0) return false;
+    //==========================output ======================
+    int mode_index = GetIndexFromMatchScore(score_unit, match_mode, score_scheme);
+    if(with_genotype){
+        ConstructMatchRecord(best_path,
+                             variant_list,
+                             subsequence,
+                             offset,
+                             thread_index,
+                             chr_id,
+                             mode_index);
+    }else{
+        ConstructMatchRecordNoGenotype(best_path,
+                                       variant_list,
+                                       subsequence,
+                                       offset,
+                                       thread_index,
+                                       chr_id,
+                                       mode_index);
+    }
+    return true;
+// Builds the match record for the best path of one cluster in
+// genotype-aware mode, stores it for (thread_index, mode_index) and
+// updates the per-thread match counters.
+//
+// Record columns (tab separated, newline terminated):
+//   1-based position, reference subsequence, donor allele(s) ("a/b" when
+//   the two donor sequences differ), variants chosen in choice_made[0],
+//   variants chosen in choice_made[1], the two phasing strings, path score.
+// Each chosen variant is written as "pos,ref,alt[/alt2]"; entries of one
+// column are joined with ';'.
+//
+// best_path     path whose choices and donor sequences are reported
+// variant_list  variants of the cluster, indexed by the ids in choice_made
+// subsequence   reference sequence of the cluster window
+// offset        0-based position of the window start (written as offset+1)
+// thread_index  selects the per-thread result containers
+// chr_id        unused here (only the disabled normalisation call needed it)
+// mode_index    selects the per-mode result containers
+void WholeGenome::ConstructMatchRecord(SequencePath & best_path,
+                                       vector<DiploidVariant> & variant_list,
+                                       string & subsequence,
+                                       int offset,
+                                       int thread_index,
+                                       int chr_id,
+                                       int mode_index){
+    int truth_num = 0;   // chosen variants with flag == false
+    int predict_num = 0; // chosen variants with flag == true
+    const string & donor0 = best_path.donor_sequences[0];
+    const string & donor1 = best_path.donor_sequences[1];
+    // The call to NormalizeVariantSequence is disabled, so position and
+    // alleles are reported exactly as given.
+    string match_record = to_string(offset+1) + "\t" + subsequence + "\t" + donor0;
+    if(donor0 != donor1) match_record += "/" + donor1;
+    string vcf_record[2];
+    string phasing_record[2];
+    vector<int> query_qual_list;
+    for(int i = 0; i < 2; i++){
+        for(auto it = best_path.choice_made[i].begin(); it != best_path.choice_made[i].end(); ++it){
+            const pair<int, int> & selection = it->second;
+            if(selection.first == -1) continue; // no variant chosen at this position
+            int phasing = selection.second;
+            if(phasing == -1) phasing = 1;
+            // reference instead of a copy: the variant is only read
+            const DiploidVariant & variant = variant_list[selection.first];
+            if(!variant.flag){
+                truth_num++;
+            }else{
+                predict_num++;
+                query_qual_list.push_back((int)variant.qual);
+            }
+            string alt_string = variant.alts[0];
+            if(variant.multi_alts){
+                alt_string += "/" + variant.alts[1];
+            }
+            string phasing_string = "";
+            if(phasing == 0){
+                phasing_string += "1";
+                if(variant.heterozygous){
+                    phasing_string += variant.multi_alts ? "|2" : "|0";
+                }else{
+                    phasing_string += "|1";
+                }
+            }else if(phasing == 1){
+                phasing_string += variant.multi_alts ? "2|1" : "0|1";
+            }
+            vcf_record[i] += to_string(variant.pos+1) + "," + variant.ref + "," + alt_string + ";";
+            phasing_record[i] += phasing_string + ";";
+        }
+        // drop the trailing ';' (nothing to drop when no variant was written)
+        if(!vcf_record[i].empty()) vcf_record[i].erase(vcf_record[i].size()-1);
+        if(!phasing_record[i].empty()) phasing_record[i].erase(phasing_record[i].size()-1);
+    }
+    // query_qual_list has one entry per counted query variant, so there is
+    // nothing to distribute (and no division by zero) when predict_num is 0.
+    if(predict_num > 0){
+        const float average_count = (float)truth_num/float(predict_num);
+        map<int, float> & matchnum_by_qual = *quality_que_matchnum_by_thread_mode[thread_index][mode_index];
+        for(size_t k = 0; k < query_qual_list.size(); k++){
+            // operator[] creates a missing entry as 0; the previous at()
+            // call threw std::out_of_range for a quality seen the first time.
+            matchnum_by_qual[query_qual_list[k]] += average_count;
+        }
+    }
+    match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+    match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+    match_record += "\t" + to_string(best_path.score) + "\n";
+    match_records_by_mode_by_thread[thread_index][mode_index]->push_back(match_record);
+    baseline_total_match_num[thread_index]->at(mode_index) += truth_num;
+    query_total_match_num[thread_index]->at(mode_index) += predict_num;
+// Builds the match record for the best path of one cluster when genotype
+// is ignored, stores it for (thread_index, mode_index) and updates the
+// per-thread match counters.
+//
+// Record columns (tab separated, newline terminated):
+//   1-based position, reference subsequence, donor sequence 0, variants
+//   chosen in choice_made[0], variants chosen in choice_made[1], the two
+//   phasing strings, path score.
+// Each chosen variant is written as "pos,ref,alt[/alt2]"; entries of one
+// column are joined with ';'. Choices whose variant id or phasing index
+// is -1 are skipped. The phasing string is "1|1" for phasing index 0 and
+// "2|2" for phasing index 1.
+//
+// best_path     path whose choices and donor sequence are reported
+// variant_list  variants of the cluster, indexed by the ids in choice_made
+// subsequence   reference sequence of the cluster window
+// offset        0-based position of the window start (written as offset+1)
+// thread_index  selects the per-thread result containers
+// chr_id        unused here (only the disabled normalisation call needed it)
+// mode_index    selects the per-mode result containers
+void WholeGenome::ConstructMatchRecordNoGenotype(SequencePath & best_path,
+                                                 vector<DiploidVariant> & variant_list,
+                                                 string & subsequence,
+                                                 int offset,
+                                                 int thread_index,
+                                                 int chr_id,
+                                                 int mode_index){
+    int truth_num = 0;   // chosen variants with flag == false
+    int predict_num = 0; // chosen variants with flag == true
+    // The call to NormalizeVariantSequence is disabled, so position and
+    // alleles are reported exactly as given.
+    string match_record = to_string(offset+1) + "\t" + subsequence + "\t" + best_path.donor_sequences[0];
+    string vcf_record[2];
+    string phasing_record[2];
+    vector<int> query_qual_list;
+    for(int i = 0; i < 2; i++){
+        for(auto it = best_path.choice_made[i].begin(); it != best_path.choice_made[i].end(); ++it){
+            const pair<int, int> & selection = it->second;
+            const int phasing = selection.second;
+            if(selection.first == -1 || phasing == -1) continue;
+            // reference instead of a copy: the variant is only read
+            const DiploidVariant & variant = variant_list[selection.first];
+            if(!variant.flag){
+                truth_num++;
+            }else{
+                predict_num++;
+                query_qual_list.push_back((int)variant.qual);
+            }
+            string alt_string = variant.alts[0];
+            if(variant.multi_alts){
+                alt_string += "/" + variant.alts[1];
+            }
+            string phasing_string = "";
+            if(phasing == 0){
+                phasing_string += "1|1";
+            }else if(phasing == 1){
+                phasing_string += "2|2";
+            }
+            vcf_record[i] += to_string(variant.pos+1) + "," + variant.ref + "," + alt_string + ";";
+            phasing_record[i] += phasing_string + ";";
+        }
+        // drop the trailing ';' (nothing to drop when no variant was written)
+        if(!vcf_record[i].empty()) vcf_record[i].erase(vcf_record[i].size()-1);
+        if(!phasing_record[i].empty()) phasing_record[i].erase(phasing_record[i].size()-1);
+    }
+    match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+    match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+    match_record += "\t" + to_string(best_path.score) + "\n";
+    match_records_by_mode_by_thread[thread_index][mode_index]->push_back(match_record);
+    // query_qual_list has one entry per counted query variant, so there is
+    // nothing to distribute (and no division by zero) when predict_num is 0.
+    if(predict_num > 0){
+        const float average_count = (float)truth_num/float(predict_num);
+        map<int, float> & matchnum_by_qual = *quality_que_matchnum_by_thread_mode[thread_index][mode_index];
+        for(size_t k = 0; k < query_qual_list.size(); k++){
+            // operator[] creates a missing entry as 0; the previous at()
+            // call threw std::out_of_range for a quality seen the first time.
+            matchnum_by_qual[query_qual_list[k]] += average_count;
+        }
+    }
+    baseline_total_match_num[thread_index]->at(mode_index) += truth_num;
+    query_total_match_num[thread_index]->at(mode_index) += predict_num;
+// Tells whether two paths have matching donor sequence lengths.
+// Donor sequences 0/1 form one pair and 2/3 the other; within a pair the
+// lengths may match either in the same order or swapped. Both pairs have
+// to match for the result to be true.
+bool WholeGenome::DonorLengthEqual(SequencePath & a, SequencePath & b){
+    size_t len_a[4];
+    size_t len_b[4];
+    for(int k = 0; k < 4; k++){
+        len_a[k] = a.donor_sequences[k].length();
+        len_b[k] = b.donor_sequences[k].length();
+    }
+    // pair 0/1
+    const bool first_pair_same =
+        (len_a[0] == len_b[0] && len_a[1] == len_b[1]) ||
+        (len_a[0] == len_b[1] && len_a[1] == len_b[0]);
+    // pair 2/3
+    const bool second_pair_same =
+        (len_a[2] == len_b[2] && len_a[3] == len_b[3]) ||
+        (len_a[2] == len_b[3] && len_a[3] == len_b[2]);
+    return first_pair_same && second_pair_same;
+// Predicate: true when the path carries the removable mark.
+bool IsRemovable(SequencePath & s){
+    return s.removable;
+}
+// Prunes path_list in place: among paths that have same_donor_len set and
+// whose donor lengths agree (DonorLengthEqual), only the best-scoring one
+// is kept. On equal scores the path that comes first in the list wins.
+// Paths without same_donor_len are never compared and never removed here.
+void WholeGenome::ConvergePaths(list<SequencePath> & path_list){
+    if(path_list.size() <= 1) return;
+    for(list<SequencePath>::iterator i = path_list.begin(); i != path_list.end(); ++i){
+        // Work on the list elements directly; copying a whole SequencePath
+        // for every pairwise comparison is unnecessary.
+        SequencePath & ref_path = *i;
+        if(ref_path.removable || !ref_path.same_donor_len) continue;
+        list<SequencePath>::iterator j = i;
+        for(++j; j != path_list.end(); ++j){
+            SequencePath & que_path = *j;
+            if(que_path.removable || !que_path.same_donor_len) continue;
+            if(!DonorLengthEqual(ref_path, que_path)) continue;
+            if(ref_path.score >= que_path.score){
+                que_path.removable = true;
+            }else{
+                // the outer path lost; no need to compare it any further
+                ref_path.removable = true;
+                break;
+            }
+        }
+    }
+    path_list.remove_if(IsRemovable);
+// Scratch fixture: stores a 9-base sequence in genome_sequences[0] and
+// builds a list of eight hand-written variants. The matching call itself
+// is commented out, so nothing is matched and 0 is always returned.
+int WholeGenome::test() {
+    genome_sequences[0] = "GTCAGCCGG";
+    vector<DiploidVariant> var_list;
+    var_list.push_back(DiploidVariant(1, "T", vector<string>({"A", "C"}), true, true, 0,0,0));
+    var_list.push_back(DiploidVariant(4, "G", vector<string>({"C", ""}), true, false, 0,0,0));
+    var_list.push_back(DiploidVariant(5, "C", vector<string>({"T", ""}), true, false, 0,0,0)); // this is false negative
+    var_list.push_back(DiploidVariant(6, "C", vector<string>({"G", ""}), true, false, 0,0,0));
+    var_list.push_back(DiploidVariant(7, "G", vector<string>({"A", ""}), true, false, 0,0,0));
+    var_list.push_back(DiploidVariant(1, "T", vector<string>({"A", "C"}), true, true, 0,0,1));
+    var_list.push_back(DiploidVariant(3, "AG", vector<string>({"A", ""}), true, false, 1,0,1));
+    var_list.push_back(DiploidVariant(7, "G", vector<string>({"GA", ""}), true, false, 0,1,1));
+    //cout << MatchingSingleClusterBaseExtending(var_list, 0) << endl;
+    //cout << complex_match_records[0]->at(0) << endl;
+    return 0;
+// private
+// Matches every cluster in variants_by_cluster with thread_num workers
+// (thread_num - 1 spawned threads plus the calling thread) and writes
+//   <output_dir>/<output_prefix>.stat                          match counts per mode
+//   <output_dir>/<output_prefix>.<unit>_<mode>_<scheme>.match  match records per mode
+//   <output_dir>/<output_prefix>.roc                           created, currently left empty
+// The per-thread result containers are allocated at the top of this
+// function and released again before it returns.
+void WholeGenome::ClusteringMatchMultiThread() {
+    int start = 0;
+    int cluster_number = variants_by_cluster.size(); // cluster number
+    int cluster_step = cluster_number / thread_num; // clusters per worker, rounded up below
+    if (cluster_step * thread_num < cluster_number) cluster_step++;
+    int end = start + cluster_step;
+    // allocate one result container per (thread, match mode)
+    match_records_by_mode_by_thread = new vector<string>**[thread_num];
+    quality_que_matchnum_by_thread_mode = new map<int, float> ** [thread_num];
+    for(int i = 0; i < thread_num; i++){
+        match_records_by_mode_by_thread[i] = new vector<string>*[MATCH_MODE_NUM];
+        quality_que_matchnum_by_thread_mode[i] = new map<int, float>*[MATCH_MODE_NUM];
+        for(int j = 0; j < MATCH_MODE_NUM; j++){
+            match_records_by_mode_by_thread[i][j] = new vector<string>;
+            quality_que_matchnum_by_thread_mode[i][j] = new map<int, float>;
+        }
+    }
+    baseline_total_match_num = new vector<int>* [thread_num];
+    query_total_match_num = new vector<int> * [thread_num];
+    for(int i = 0; i < thread_num; i++){
+        baseline_total_match_num[i] = new vector<int>;
+        baseline_total_match_num[i]->resize(MATCH_MODE_NUM, 0);
+        query_total_match_num[i] = new vector<int>;
+        query_total_match_num[i]->resize(MATCH_MODE_NUM, 0);
+    }
+    vector<thread> threads;
+    //spawn threads, each one gets the next cluster_step clusters
+    unsigned i = 0;
+    for (; i < thread_num - 1; i++) {
+        threads.push_back(thread(&WholeGenome::ClusteringMatchInThread, this, start, end, i));
+        start = end;
+        end = start + cluster_step;
+    }
+    // the calling thread takes the last share itself
+    // i equals to (thread_num - 1)
+    if (i != thread_num - 1) {
+        dout << "[Error] thread number not match" << endl;
+    }
+    if (start >= variants_by_cluster.size()) {
+        dout << "[Error] index out of map range" << endl;
+    }
+    else {
+        ClusteringMatchInThread(start, end, i);
+    }
+    // wait for all spawned workers before reading their results
+    std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
+    //output all results
+    ofstream output_stat_file;
+    output_stat_file.open(output_dir + "/" + output_prefix+".stat");
+    cout << "=========VarMatch Result Stat.=======" << endl;
+    // The third value written per row is score_scheme; the header used to
+    // repeat "score_unit" for that column.
+    string stat_head_string = "#score_unit\tmatch_mode\tscore_scheme\tbaseline_match_num\tquery_match_num";
+    cout << stat_head_string << endl;
+    output_stat_file << "##Baseline:" << baseline_variant_total_num << endl;
+    output_stat_file << "##Query:"<< query_variant_total_num << endl;
+    output_stat_file << stat_head_string << endl;
+    int score_unit;
+    int match_mode;
+    int score_scheme;
+    // one summary row per (score_unit, match_mode, score_scheme) combination
+    for(int x = 0; x < score_unit_list.size(); x++){
+        score_unit = score_unit_list[x];
+        for(int y = 0; y < match_mode_list.size(); y++){
+            match_mode = match_mode_list[y];
+            for(int z = 0; z < score_scheme_list.size(); z++){
+                score_scheme = score_scheme_list[z];
+                int mode_index = GetIndexFromMatchScore(score_unit, match_mode, score_scheme);
+                // sum the counters of all threads for this mode
+                int baseline_match_num_by_threshold_by_mode = 0;
+                int query_match_num_by_threshold_by_mode = 0;
+                for(int i = 0; i < thread_num; i++){
+                    baseline_match_num_by_threshold_by_mode += baseline_total_match_num[i]->at(mode_index);
+                    query_match_num_by_threshold_by_mode += query_total_match_num[i]->at(mode_index);
+                }
+                string baseline_match_num_string = to_string(baseline_match_num_by_threshold_by_mode);
+                string query_match_num_string = to_string(query_match_num_by_threshold_by_mode);
+                string total_match_num_string = to_string(score_unit) + "\t" +
+                                                to_string(match_mode) + "\t" +
+                                                to_string(score_scheme) + "\t" +
+                                                baseline_match_num_string + "\t" +
+                                                query_match_num_string;
+                cout << total_match_num_string << endl;
+                output_stat_file << total_match_num_string << endl;
+            }
+        }
+    }
+    output_stat_file.close();
+    // one .match file per (score_unit, match_mode, score_scheme) combination
+    for(int x = 0; x < score_unit_list.size(); x++){
+        score_unit = score_unit_list[x];
+        for(int y = 0; y < match_mode_list.size(); y++){
+            match_mode = match_mode_list[y];
+            for(int z = 0; z < score_scheme_list.size(); z++){
+                score_scheme = score_scheme_list[z];
+                int mode_index = GetIndexFromMatchScore(score_unit, match_mode, score_scheme);
+                string filename_index = to_string(score_unit) + "_" + to_string(match_mode) + "_" + to_string(score_scheme);
+                ofstream output_complex_file;
+                output_complex_file.open(output_dir + "/" + output_prefix+"."+filename_index+".match");
+                output_complex_file << "##VCF1:" << ref_vcf_filename << endl;
+                output_complex_file << "##VCF2:" << que_vcf_filename << endl;
+                output_complex_file << "#CHROM\tPOS\tREF\tALT\tVCF1\tVCF2\tPHASE1\tPHASE2\tSCORE" << endl;
+                for(int i = 0; i < thread_num; i++){
+                    for(int k = 0; k < match_records_by_mode_by_thread[i][mode_index]->size(); k++){
+                        // records consisting only of spaces are not written
+                        if (match_records_by_mode_by_thread[i][mode_index]->at(k).find_first_not_of(' ') != std::string::npos) {
+                            output_complex_file << match_records_by_mode_by_thread[i][mode_index]->at(k);
+                        }
+                    }
+                }
+                output_complex_file.close();
+            }
+        }
+    }
+    // Per-quality aggregation for the ROC output. The result is not written
+    // anywhere yet.
+    // NOTE(review): every (thread, quality) entry adds 1.0 here, the stored
+    // per-thread value (it->second) is not used - confirm this is intended.
+    map<int, float> query_qual_matchnum[MATCH_MODE_NUM];
+    for(int i  = 0; i < mode_index_list.size(); i++){
+        int mode_index = mode_index_list[i];
+        for(int t = 0; t < thread_num; t++){
+            auto matchmap_pointer = quality_que_matchnum_by_thread_mode[t][mode_index];
+            for(auto it = matchmap_pointer->begin(); it != matchmap_pointer->end(); ++it){
+                int qual = it->first;
+                if(query_qual_matchnum[mode_index].find(qual) != query_qual_matchnum[mode_index].end()){
+                    query_qual_matchnum[mode_index][qual] += 1.0;
+                }else{
+                    query_qual_matchnum[mode_index][qual] = 1.0;
+                }
+            }
+        }
+    }
+    // the .roc file is created but no content is written to it
+    ofstream output_roc_file;
+    output_roc_file.open(output_dir + "/" + output_prefix+".roc");
+    output_roc_file.close();
+    // clear all matching records
+    for(int i = 0; i < thread_num; i++){
+        for(int j = 0; j < MATCH_MODE_NUM; j++){
+            delete match_records_by_mode_by_thread[i][j];
+            delete quality_que_matchnum_by_thread_mode[i][j];
+        }
+        delete[] match_records_by_mode_by_thread[i];
+        delete[] quality_que_matchnum_by_thread_mode[i];
+        delete baseline_total_match_num[i];
+        delete query_total_match_num[i];
+    }
+    delete[] match_records_by_mode_by_thread;
+    delete[] quality_que_matchnum_by_thread_mode;
+    delete[] baseline_total_match_num;
+    delete[] query_total_match_num;
+// Trims a reference allele and two alternative alleles to a shorter
+// equivalent representation, modifying the three strings in place.
+//   1. While all three alleles end in the same base, that base is removed;
+//      when an allele becomes empty, the genome base to the left of the
+//      current position is prepended to all three and the position moves
+//      one to the left.
+//   2. While all three alleles start with the same base and are longer
+//      than one base, that base is removed and the position moves right.
+// Returns the resulting position, or -1 when no sequence is loaded for
+// chr_id.
+// NOTE(review): the alleles are assumed to be non-empty on entry (back()
+// is called on them) - confirm against callers.
+int WholeGenome::NormalizeVariantSequence(int pos, string & parsimonious_ref, string & parsimonious_alt0, string & parsimonious_alt1, int chr_id) {
+    int left_index = pos;
+    if (genome_sequences[chr_id].size() == 0) return -1;
+    // Single-base alleles cannot be trimmed: the position is unchanged.
+    // (This used to "return true", i.e. position 1, from an int function.)
+    if (parsimonious_ref.size() == 1 && parsimonious_alt0.size() == 1 && parsimonious_alt1.size() == 1) return left_index;
+    bool change_in_allels = true;
+    while (change_in_allels) {
+        change_in_allels = false;
+        if (parsimonious_ref.back() == parsimonious_alt0.back() && parsimonious_ref.back() == parsimonious_alt1.back() ) {
+            if ((parsimonious_ref.size() > 1 && parsimonious_alt0.size() > 1 && parsimonious_alt1.size() > 1) || left_index > 0) { // when left_index == 0, can not make further changes
+                parsimonious_ref.pop_back();
+                parsimonious_alt0.pop_back();
+                parsimonious_alt1.pop_back();
+                change_in_allels = true;
+            }
+            // else do not make further changes
+        }
+        if (parsimonious_ref.length() == 0 || parsimonious_alt0.length() == 0 || parsimonious_alt1.length() == 0) {
+            // an allele ran empty: extend all three by one base to the left
+            left_index--;
+            char left_char = toupper(genome_sequences[chr_id][left_index]);
+            parsimonious_ref = left_char + parsimonious_ref;
+            parsimonious_alt0 = left_char + parsimonious_alt0;
+            parsimonious_alt1 = left_char + parsimonious_alt1;
+        }
+    }
+    while (parsimonious_ref[0] == parsimonious_alt0[0] &&
+            parsimonious_ref[0] == parsimonious_alt1[0] &&
+            parsimonious_ref.size() > 1 &&
+            parsimonious_alt0.size() > 1 &&
+            parsimonious_alt1.size() > 1)
+    {
+        parsimonious_ref.erase(0, 1);
+        parsimonious_alt0.erase(0, 1);
+        parsimonious_alt1.erase(0, 1);
+        left_index ++; // left_index is the variant position, so it follows the truncation
+    }
+    return left_index;
+// Groups the baseline and query variants of chromosome chr_id into
+// clusters and appends them to variant_cluster_by_chrid[chr_id].
+//
+// Both per-chromosome lists are sorted and then merged by position (on
+// equal positions the query variant is taken first). Before a variant is
+// added, the gap between c_start (the furthest end of the variants seen in
+// the current cluster) and the variant position is examined. The current
+// cluster is closed when the gap is at least 2 bases and either
+//   - no insertion/deletion length has been accumulated, or
+//   - the gap is longer than twice the accumulated change and is either
+//     longer than MAX_REPEAT_LEN or not reported as a tandem repeat by
+//     CheckTandemRepeat.
+void WholeGenome::SingleThreadClustering(int chr_id) {
+    // accumulated snp.mil / snp.mdl of the current cluster,
+    // index 0: variants with flag == false, index 1: flag == true
+    int ins_len[2] = { 0 };
+    int del_len[2] = { 0 };
+    int c_start = 0;
+    int c_end = 0;
+    sort(ref_variant_by_chrid[chr_id]->begin(), ref_variant_by_chrid[chr_id]->end());
+    sort(que_variant_by_chrid[chr_id]->begin(), que_variant_by_chrid[chr_id]->end());
+    int ref_size = ref_variant_by_chrid[chr_id]->size();
+    int que_size = que_variant_by_chrid[chr_id]->size();
+    int ref_index = 0;
+    int que_index = 0;
+    bool not_first = false;
+    DiploidVariant snp;
+    vector<VariantIndicator> vi_list;
+    while (ref_index < ref_size || que_index < que_size) {
+        // pick the next variant in position order
+        bool take_que = true;
+        if(ref_index < ref_size && que_index < que_size){
+            if(ref_variant_by_chrid[chr_id]->at(ref_index).pos < que_variant_by_chrid[chr_id]->at(que_index).pos){
+                take_que = false;
+            }
+        }else if(ref_index < ref_size){
+            take_que = false;
+        }
+        int var_index;
+        if(take_que){
+            snp = que_variant_by_chrid[chr_id]->at(que_index);
+            var_index = que_index;
+            que_index++;
+        }else{
+            snp = ref_variant_by_chrid[chr_id]->at(ref_index);
+            var_index = ref_index;
+            ref_index++;
+        }
+        // check whether the current cluster has to be closed first
+        if (not_first) {
+            c_end = snp.pos;
+            if (c_end - c_start >= 2) {
+                int separator_length = c_end - c_start;
+                string separator = genome_sequences[chr_id].substr(c_start, separator_length);
+                int max_change = max(ins_len[0] + del_len[1], ins_len[1] + del_len[0]);
+                bool separate_cluster = false;
+                if(max_change == 0){
+                    separate_cluster = true;
+                }
+                else if (separator_length > 2 * max_change &&
+                    (separator_length > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, max_change)))
+                {
+                    separate_cluster = true;
+                }
+                if(separate_cluster){
+                    variant_cluster_by_chrid[chr_id]->push_back(vi_list);
+                    vi_list.clear();
+                    ins_len[0] = 0;
+                    del_len[0] = 0;
+                    ins_len[1] = 0;
+                    del_len[1] = 0;
+                    c_start = 0; // re-assign c_start
+                }
+            }
+        }
+        c_start = max(c_start, snp.pos + (int)snp.ref.length() );
+        VariantIndicator current_variant_indicator(chr_id, var_index, !take_que);
+        vi_list.push_back(current_variant_indicator);
+        if(!not_first) not_first = true;
+        int flag = 0;
+        if(snp.flag) flag = 1;
+        ins_len[flag] += snp.mil;
+        del_len[flag] += snp.mdl;
+    }
+    // The loop only stores a cluster when the next one begins, so the
+    // variants collected last are still pending here; store them as well.
+    if(!vi_list.empty()){
+        variant_cluster_by_chrid[chr_id]->push_back(vi_list);
+    }
+// Reads the baseline VCF by forwarding to ReadWholeGenomeVariant with
+// `false` as second argument; the callee's return value is passed through.
+int WholeGenome::ReadReferenceVariants(string filename){
+    const bool second_argument = false;
+    int result = ReadWholeGenomeVariant(filename, second_argument);
+    return result;
+// Reads the query VCF by forwarding to ReadWholeGenomeVariant with
+// `true` as second argument; the callee's return value is passed through.
+int WholeGenome::ReadQueryVariants(string filename){
+    const bool second_argument = true;
+    int result = ReadWholeGenomeVariant(filename, second_argument);
+    return result;
+// Loads the genome sequence file and the baseline VCF.
+// The value returned by ReadReferenceVariants is kept in
+// baseline_variant_total_num and the VCF name in ref_vcf_filename.
+void WholeGenome::ReadRef(string genome_seq, string ref_vcf){
+    this->ReadWholeGenomeSequence(genome_seq);
+    const int loaded_variant_num = this->ReadReferenceVariants(ref_vcf);
+    this->baseline_variant_total_num = loaded_variant_num;
+    this->ref_vcf_filename = ref_vcf;
+// Compares one query VCF against the already loaded baseline.
+//
+// query_vcf       path of the query VCF
+// output_prefix   stored in the member of the same name (used for output file names)
+// detail_results  stored in the member of the same name
+//
+// When score_scheme_indicator == 3 only DirectMatch is run. Otherwise the
+// query is read, statistics are printed, and clustering plus multi-thread
+// matching are executed. The query data loaded by this call is released on
+// every exit path.
+void WholeGenome::Compare(string query_vcf,
+	string output_prefix,
+    bool detail_results)
+    // initialize query variant data structure
+    que_variant_by_chrid = new vector<DiploidVariant>*[chrom_num];
+    for (int j = 0; j < chrom_num; j++) {
+        que_variant_by_chrid[j] = new vector<DiploidVariant>;
+    }
+    // Releases everything this call holds for the query VCF. Shared by
+    // the direct-match return and the normal exit; the direct-match
+    // return used to skip it and leaked the containers allocated above.
+    auto release_query_data = [this]() {
+        for(int j = 0; j < chrom_num; j++){
+            delete que_variant_by_chrid[j];
+        }
+        delete[] que_variant_by_chrid;
+        query_variant_strings.clear();
+        query_variant_total_num = 0;
+        quality_que_totalnum.clear();
+    };
+    if(score_scheme_indicator == 3){
+        DirectMatch(ref_vcf_filename, query_vcf);
+        release_query_data();
+        return;
+    }
+    que_vcf_filename = query_vcf;
+    this->output_prefix = output_prefix;
+    this->detail_results = detail_results;
+    query_variant_total_num = ReadQueryVariants(query_vcf);
+    cout << "Baseline VCF: " << ref_vcf_filename << endl;
+    cout << "Query VCF: " << query_vcf << endl;
+    cout << "========VCF Stat.==========" << endl;
+    cout << "Total Number of VCF Entries: " << endl;
+    cout << "Baseline: " << baseline_variant_total_num << "; Query: " << query_variant_total_num << endl;
+    ParallelClustering();
+    ClusteringMatchMultiThread();
+    // most clustering results are cleared inside ParallelClustering function except the following one
+    // which is needed for matching
+    variants_by_cluster.clear();
+    // clean at the end of function
+    release_query_data();
+    // The following three matching results are cleared inside ClusteringMatchMultiThread function
+    // match_records_by_mode_by_thread;
+    // baseline_total_match_num;
+    // query_total_match_num;
+    return;
+void WholeGenome::DirectMatch(string ref_vcf, string query_vcf)
+    //dout << "direct match" << endl;
+    int ref_variant_num = ReadReferenceVariants(ref_vcf);
+    int que_variant_num = ReadQueryVariants(query_vcf);
+    dout << ref_variant_num << "," << que_variant_num << endl;
+    int match_num = 0;
+    for(int i = 0; i < chrom_num; i++){
+        if(ref_variant_by_chrid[i]->size() == 0 || que_variant_by_chrid[i]->size() == 0)
+            continue;
+        //[TODO] not the right way to do it, at least need multimap
+        multimap<int, int> ref_variant_by_pos;
+        for(int j = 0; j < ref_variant_by_chrid[i]->size(); j++){
+            DiploidVariant var = ref_variant_by_chrid[i]->at(j);
+            int pos = var.pos;
+            ref_variant_by_pos.insert(pair<int, int>(pos, j));
+        }
+        for(int j = 0; j < que_variant_by_chrid[i]->size(); j++){
+            DiploidVariant var = que_variant_by_chrid[i]->at(j);
+            int pos = var.pos;
+            if(ref_variant_by_pos.find(pos) == ref_variant_by_pos.end())
+                continue;
+            pair<multimap<int, int>::iterator, multimap<int, int>::iterator> var_range;
+            var_range = ref_variant_by_pos.equal_range(pos);
+            for(auto it = var_range.first; it != var_range.second; ++it){
+                int ref_index = (*it).second;
+                DiploidVariant ref_var = ref_variant_by_chrid[i]->at(ref_index);
+                if (match_mode_indicator != 1 && var == ref_var){
+                    match_num ++;
+                    break;
+                }else if(match_mode_indicator == 1 && var.CompareNoGenotype(ref_var)){
+                    match_num ++;
+                    break;
+                }
+            }
+        }
+    }
+    dout << "matched variants: " << match_num << endl;
diff --git a/src/wholegenome_backup.h b/src/wholegenome_backup.h
new file mode 100644
index 0000000..8746e48
--- /dev/null
+++ b/src/wholegenome_backup.h
@@ -0,0 +1,274 @@
+#pragma once
+#include <unordered_map>
+#include <algorithm>
+#include <vector>
+#include <map>
+#include <list>
+#include <tuple>
+#include <chrono>
+#include <cstdint>
+#include <limits>
+#include <thread>
+#include <set>
+#include "util.h"
+#include "diploidvariant.h"
+//#include "tbb/task_scheduler_init.h"
+//#include "tbb/blocked_range.h"
+//#include "tbb/parallel_for.h"
+//#include "tbb/concurrent_vector.h"
+typedef struct VariantIndicator{ // small record: chromosome id + variant id + a `refer` flag
+    VariantIndicator(int chr_id_ = -1, // chromosome id, -1 when not given
+    int var_id_ = -1, // variant id, -1 when not given
+    bool refer_ = true) : // flag value, true when not given
+    chr_id(chr_id_), // NOTE(review): the int argument is narrowed to char here
+    var_id(var_id_),
+    refer(refer_){}
+    char chr_id; // stored as char, so only small ids are representable — TODO confirm the id range
+    int var_id; // presumably an index into a per-chromosome variant vector — verify against callers
+    bool refer; // presumably true for baseline-set variants and false for query-set ones — TODO confirm
+typedef struct Interval { // integer interval [start, end] used by WholeGenome::merge
+    int start; // first position of the interval
+    int end; // last position; merge() treats an interval starting exactly at `end` as overlapping
+    Interval() : start(0), end(0) {} // default: the interval [0, 0]
+    Interval(int s, int e) : start(s), end(e) {} // stores s and e as given, no ordering check
+// State of one candidate path explored while matching a variant cluster.
+class SequencePath{
+    // Create an empty path over a reference window of n positions:
+    // every per-position slot holds "." and every donor sequence is empty.
+    SequencePath(int n)
+    {
+        reference_length = n;
+        current_genome_pos = -1;
+        reached_sync_num = 0;
+        score = 0;
+        same_donor_len = false;
+        removable = false;
+        for(int k = 0; k < 2; k++){
+            current_equal_donor_pos[k] = -1;
+        }
+        for(int k = 0; k < 4; k++){
+            // "." is the placeholder value for an untouched position
+            string_sequences[k].resize(n, ".");
+            donor_sequences[k].clear();
+        }
+    }
+    int reference_length;
+    vector<string> string_sequences[4];
+    // records whether a choice was made and which one;
+    // one choice is a pair: variant id, phasing index
+    map<int, pair<int, int>> choice_made[2];
+    int current_genome_pos;
+    string donor_sequences[4];
+    int current_equal_donor_pos[2];
+    int score;
+    bool removable;
+    bool same_donor_len;
+    int reached_sync_num;
+// Whole-genome VCF comparison driver (backup revision of the header).
+// Holds the per-chromosome baseline/query variants, the genome sequences,
+// the matching parameters and the per-thread result buffers.
+class WholeGenome{
+    int chrom_num;   // number of per-chromosome slots iterated by DirectMatch
+    int thread_num;
+    string ref_vcf_filename;
+    string que_vcf_filename;
+    int baseline_variant_total_num;
+    int query_variant_total_num;
+    vector<string> baseline_variant_strings;
+    vector<string> query_variant_strings;
+    bool detail_results;
+    map<string, int> chrid_by_chrname;
+    map<int, string> chrname_by_chrid;
+    map<string, int> chrname_dict;
+    map<int, string> genome_sequences;
+    // arrays (indexed by chromosome id) of pointers to variant vectors
+    vector<DiploidVariant> ** ref_variant_by_chrid;
+    vector<DiploidVariant> ** que_variant_by_chrid;
+    vector<vector<VariantIndicator>> ** variant_cluster_by_chrid;
+    // a cluster is represented as vector<VariantIndicator>; per chromosome
+    // we keep a vector of clusters, and this member is an array of
+    // pointers to those vectors
+    vector<vector<VariantIndicator>> variants_by_cluster;
+    vector<string> *** match_records_by_mode_by_thread;
+    vector<int> ** baseline_total_match_num;
+    vector<int> ** query_total_match_num;
+    map<int, float> *** quality_que_matchnum_by_thread_mode;
+    string output_prefix;
+    string output_dir;
+    int score_unit_indicator;
+    // DirectMatch compares with CompareNoGenotype when this is 1 and with
+    // operator== otherwise
+    int match_mode_indicator;
+    int score_scheme_indicator;
+    vector<int> score_unit_list;
+    vector<int> match_mode_list;
+    vector<int> score_scheme_list;
+    vector<int> mode_index_list;
+    map<int, float> quality_que_totalnum;
+    bool ReadWholeGenomeSequence(string filename);
+    bool ReadGenomeSequenceList(string filename);
+    int ReadWholeGenomeVariant(string filename, bool flag);
+    bool ReadVariantFileList(string filename);
+    // DirectMatch uses the int results of these two as variant counts
+    int ReadReferenceVariants(string filename);
+    int ReadQueryVariants(string filename);
+    bool ParallelClustering(); // parallel by chr id
+    bool ParallelMatching(); // parallel by task
+    bool TBBMatching();
+    void SingleThreadClustering(int chr_id);
+    bool ClusteringMatchInThread(int start, int end, int thread_index);
+    void ClusteringMatchMultiThread();
+    int NormalizeVariantSequence(int pos,
+                             string & parsimonious_ref,
+                             string & parsimonious_alt0,
+                             string & parsimonious_alt1,
+                             int chr_id);
+    // strict weak ordering of intervals by start position only
+    struct compInterval {
+        bool operator()(const Interval &a, const Interval &b) const {
+            return a.start<b.start;
+        }
+    };
+    // Return the union of `intervals` as a list of non-overlapping
+    // intervals ordered by start. `intervals` is sorted in place as a side
+    // effect. Two intervals are merged unless the earlier one ends strictly
+    // before the later one starts.
+    vector<Interval> merge(vector<Interval> &intervals) {
+        sort(intervals.begin(),intervals.end(),compInterval());
+        vector<Interval> results;
+        for(size_t i=0; i<intervals.size(); i++) {
+            if(results.empty() || results.back().end < intervals[i].start)  // no overlap
+                results.push_back(intervals[i]);
+            else   // overlap: extend the interval being built
+                results.back().end = max(results.back().end, intervals[i].end);
+        }
+        return results;
+    }
+    bool PathNeedDecision(SequencePath& sp, multimap<int, int> * choices_by_pos[], int pos);
+    int PathExtendOneStep(SequencePath& sp,
+                          multimap<int, int> * choices_by_pos[],
+                          const string & reference_sequence,
+                          vector<int> & sync_points,
+                          int match_mode);
+    bool PathMakeDecision(SequencePath& sp,
+                                 vector<DiploidVariant> & variant_list,
+                                 multimap<int, int> * choices_by_pos[],
+                                 list<SequencePath> & sequence_path_list,
+                                 const string & reference_sequence,
+                                 int score_unit,
+                                 int match_mode,
+                                 int score_scheme);
+    bool MatchingSingleClusterBaseExtending(int cluster_index,
+                                            int thread_index,
+                                            vector<DiploidVariant> & variant_list,
+                                            string & subsequence,
+                                            int offset,
+                                            multimap<int, int> * choices_by_pos[],
+                                            vector<int> & sync_points,
+                                            int chr_id,
+                                            int score_unit,
+                                            int match_mode,
+                                            int score_scheme);
+    bool DonorLengthEqual(SequencePath & a, SequencePath & b);
+    void ConvergePaths(list<SequencePath> & path_list);
+    int CheckPathEqualProperty(SequencePath & sp, int match_mode);
+    int ScoreEditDistance(DiploidVariant & dv, int allele_indicator);
+    int EditDistance(const std::string& s1, const std::string& s2);
+    bool PathMakeDecisionNoGenotype(SequencePath& sp,
+                                 vector<DiploidVariant> & variant_list,
+                                 multimap<int, int> * choices_by_pos[],
+                                 list<SequencePath> & sequence_path_list,
+                                 const string & reference_sequence,
+                                 int score_unit,
+                                 int match_mode,
+                                 int score_scheme);
+    void ConstructMatchRecord(SequencePath & best_path,
+                               vector<DiploidVariant> & variant_list,
+                               string & subsequence,
+                               int offset,
+                               int thread_index,
+                               int chr_id,
+                               int mode_index);
+    void ConstructMatchRecordNoGenotype(SequencePath & best_path,
+                                       vector<DiploidVariant> & variant_list,
+                                       string & subsequence,
+                                       int offset,
+                                       int thread_index,
+                                       int chr_id,
+                                       int mode_index);
+    int CalculateScore(DiploidVariant & dv,
+                       int choice,
+                       int score_unit,
+                       int match_mode,
+                       int score_scheme);
+    int GetIndexFromMatchScore(int score_unit, int match_mode, int score_scheme);
+    bool ClearQuery();
+    // Upper-case `s` in place.
+    inline void ToUpper(string & s){
+        // toupper() is undefined for negative char values, so every
+        // character is passed through unsigned char first
+        transform(s.begin(), s.end(), s.begin(),
+                  [](unsigned char c){ return (char)::toupper(c); });
+    }
+    bool CheckTandemRepeat(string sequence, int unit_threshold);
+    bool MatchVariantListInThread(int thread_index,
+        int chr_id,
+        vector<DiploidVariant> & variant_list,
+        int cluster_id);
+    WholeGenome(int thread_num_,
+                int score_unit_,
+                int match_mode_,
+                int score_scheme_,
+                string output_dir_);
+    ~WholeGenome();
+    void ReadRef(string genome_seq,
+      string ref_vcf);
+    void Compare(string query_vcf,
+        string output_prefix,
+        bool detail_results);
+    // position-and-allele comparison of two VCF files without clustering
+    void DirectMatch(string ref_vcf,
+                string query_vcf);
+    int test(); // for direct test
+    void PrintPath(SequencePath & sp);
+    const static int MATCH_MODE_NUM = 16; // presumably the number of mode indices — TODO confirm
+    const static int VAR_LEN = 100;
+    const static int MAX_REPEAT_LEN = 1000;
+    const static int ROC_SAMPLE_NUM = 5;
diff --git a/src/wholegenome_working.cpp b/src/wholegenome_working.cpp
new file mode 100644
index 0000000..b796b17
--- /dev/null
+++ b/src/wholegenome_working.cpp
@@ -0,0 +1,2471 @@
+#include "wholegenome.h"
+using namespace std;
+// Set up an empty comparison context.
+//   thread_num_ - stored in thread_num
+//   output_dir_ - stored in output_dir
+//   pr_curves   - selects the five-entry per_list instead of the single 0.0
+// Allocates one empty variant vector per chromosome slot (24 slots) for
+// both the baseline and the query set, and fills chrname_dict with the
+// names 1..22, X, Y in both plain and "chr"-prefixed form.
+WholeGenome::WholeGenome(int thread_num_,
+    string output_dir_,
+    bool pr_curves){
+    chrom_num = 24;
+    thread_num = thread_num_;
+    output_dir = output_dir_;
+    ref_variant_by_chrid = new vector<DiploidVariant>*[chrom_num];
+    for (int slot = 0; slot < chrom_num; ++slot) {
+        ref_variant_by_chrid[slot] = new vector<DiploidVariant>;
+    }
+    que_variant_by_chrid = new vector<DiploidVariant>*[chrom_num];
+    for (int slot = 0; slot < chrom_num; ++slot) {
+        que_variant_by_chrid[slot] = new vector<DiploidVariant>;
+    }
+    // chromosome ids are 0-based: "1"/"chr1" -> 0 ... "22"/"chr22" -> 21
+    for (int number = 1; number <= 22; ++number) {
+        const string plain_name = to_string(number);
+        const int id = number - 1;
+        chrname_dict[plain_name] = id;
+        chrname_dict["chr" + plain_name] = id;
+    }
+    chrname_dict["X"] = 22;
+    chrname_dict["chrX"] = 22;
+    chrname_dict["Y"] = 23;
+    chrname_dict["chrY"] = 23;
+    if(!pr_curves){
+        per_list = {0.0};
+    }else{
+        per_list = {0.0, 0.1, 0.2, 0.3, 0.9};
+    }
+// Pack the three mode selectors into one 4-bit index:
+//   bit 3    = lowest bit of score_unit
+//   bit 2    = lowest bit of match_mode
+//   bits 1-0 = lowest two bits of score_scheme
+inline int WholeGenome::GetIndexFromMatchScore(int score_unit, int match_mode, int score_scheme){
+    const int unit_bit = score_unit & 1;
+    const int mode_bit = match_mode & 1;
+    const int scheme_bits = score_scheme & 3;
+    return (unit_bit << 3) | (mode_bit << 2) | scheme_bits;
+    for(int j = 0; j < chrom_num; j++){ // presumably the body of ~WholeGenome(); its signature line is absent from this excerpt
+        ref_variant_by_chrid[j]->clear();
+        delete ref_variant_by_chrid[j]; // frees one per-chromosome vector that the constructor allocated with new
+        que_variant_by_chrid[j]->clear();
+        delete que_variant_by_chrid[j];
+    } // NOTE(review): the arrays are sized with chrom_num == 24 in the constructor, but ReadWholeGenomeSequence overwrites chrom_num; if the values differ this loop frees too few or too many slots
+    delete[] ref_variant_by_chrid; // release the pointer arrays themselves
+    delete[] que_variant_by_chrid;
+// Load every record of a FASTA file into genome_sequences.
+//   filename - path of the FASTA file
+// Returns false when the file cannot be opened or when a record name is
+// not a key of chrname_dict; true otherwise.
+// The record name is the header text between '>' and the first space.
+// Chromosome ids are handed out in order of first appearance (0, 1, ...)
+// and recorded in chrid_by_chrname / chrname_by_chrid. A record that
+// contains a space inside its sequence lines is dropped. On success
+// chrom_num is overwritten with the number of records stored.
+bool WholeGenome::ReadWholeGenomeSequence(string filename){
+    std::ifstream input(filename);
+    if(!input.good()){
+        std::cerr << "Error opening '"<<filename<<"'. Bailing out." << std::endl;
+        return false;
+    }
+    std::string line, name, content;
+    int real_chrom_num = 0;
+    int chr_id = 0;
+    // Store the record currently held in (name, content).
+    // Returns false when the name is not a known chromosome name.
+    auto store_record = [&]() -> bool {
+        if(chrname_dict.find(name) == chrname_dict.end()){
+            cout << "[VarMatch] Error: detected chromosome name: " << name <<" does not exist in human genome." << endl;
+            return false;
+        }
+        if(chrid_by_chrname.find(name) == chrid_by_chrname.end()){
+            chrid_by_chrname[name] = chr_id;
+            chr_id++;
+        }
+        int current_id = chrid_by_chrname[name];
+        chrname_by_chrid[current_id] = name;
+        genome_sequences[current_id] = content;
+        real_chrom_num++;
+        return true;
+    };
+    // Test the stream itself rather than .good(): getline sets eofbit when
+    // the final line has no trailing newline, and .good() would then
+    // silently discard that last line of sequence.
+    while( std::getline( input, line ) ){
+        if( line.empty() || line[0] == '>' ){ // record separator / identifier
+            if( !name.empty() ){ // flush the record read so far
+                if(!store_record()) return false;
+                name.clear();
+            }
+            if( !line.empty() ){
+                name = split(line, ' ')[0].substr(1);
+            }
+            content.clear();
+        } else if( !name.empty() ){
+            if( line.find(' ') != std::string::npos ){ // invalid sequence: no spaces allowed
+                name.clear();
+                content.clear();
+            } else {
+                content += line;
+            }
+        }
+    }
+    if( !name.empty() ){ // flush the last record of the file
+        if(!store_record()) return false;
+    }
+    chrom_num = real_chrom_num;
+    return true;
+bool WholeGenome::ReadGenomeSequenceList(string filename){
+// Read the variants of one VCF file into the per-chromosome lists.
+//   filename - path of the VCF file
+//   flag     - false: baseline file (ref_variant_by_chrid);
+//              true:  query file (que_variant_by_chrid, query_variant_strings)
+// Returns the number of variants stored, or -1 when the file cannot be
+// opened.
+// Skipped records: header/blank lines, lines with fewer than 6 columns,
+// 0/0 genotypes, indels longer than VAR_LEN, and chromosomes that have no
+// loaded genome sequence. Genotype matching is switched off
+// (match_mode_indicator = 1) when genotype data is missing or unparsable.
+// For a query file the QUAL values are additionally turned into
+// threshold_list / per_list / threshold_num.
+int WholeGenome::ReadWholeGenomeVariant(string filename, bool flag){
+    int total_num = 0;
+    int long_num = 0;
+    double QUAL_LOWER_BOUND = 0.1;
+    ifstream vcf_file;
+    vcf_file.open(filename.c_str());
+    if (!vcf_file.good()) {
+        cout << "[VarMatch] Error: can not open vcf file" << endl;
+        return -1;
+    }
+    vector<float> quality_list;
+    int genotype_index = -1;
+    char genotype_separator = '/';
+    string line;
+    // loop on the read itself, so a stream error cannot spin forever
+    while (getline(vcf_file, line, '\n')) {
+        if ((int)line.length() <= 1) continue;
+        if (line[0] == '#') {
+            continue;
+        }
+        auto columns = split(line, '\t');
+        if (columns.size() < 10) {
+            if(match_mode_indicator != 1){
+                cout << "[VarMatch] Warning: not enough information in VCF file for genotype matching." << endl;
+                cout << "[VarMatch] \tAutomatically turn off genotype matching module " << filename << endl;
+                match_mode_indicator = 1;
+            }
+            if(columns.size() < 6){
+                cout << "[VarMatch] Warning: not enough information in VCF file for variant matching." << endl;
+                cout << "[VarMatch] skip current variant: " << line << endl;
+                continue;
+            }
+        }
+        string chr_name = columns[0];
+        auto pos = atoi(columns[1].c_str()) - 1; // 0-based coordinate
+        auto ref = columns[3];
+        auto alt_line = columns[4];
+        // VCF writes a missing QUAL as "."; stod would throw on it, so a
+        // missing quality is treated as 0
+        double quality = (columns[5] == ".") ? 0.0 : stod(columns[5]);
+        if(flag){
+            quality_list.push_back(quality);
+        }
+        ToUpper(ref);
+        ToUpper(alt_line);
+        bool is_heterozygous_variant = false;
+        bool is_multi_alternatives = false;
+        if (match_mode_indicator != 1) { // genotype matching still enabled
+            if (genotype_index < 0) {
+                // locate the GT entry of the FORMAT column once
+                auto formats = split(columns[8], ':');
+                for (size_t i = 0; i < formats.size(); i++) {
+                    if (formats[i] == "GT") {
+                        genotype_index = (int)i;
+                        break;
+                    }
+                }
+                if(genotype_index < 0){ // GT not found
+                    cout << "[VarMatch] Warning: VCF entry does not contain genotype information." << endl;
+                    cout << "[VarMatch] \tAutomatically turn off genotype matching mode. " << endl;
+                    match_mode_indicator = 1;
+                }
+            }
+            if(match_mode_indicator != 1){
+                auto additionals = split(columns[9], ':');
+                vector<string> genotype_columns = split(additionals[genotype_index], genotype_separator);
+                if(genotype_columns.size() != 2){
+                    // retry with the other separator and keep it for later lines
+                    if(genotype_separator == '/'){
+                        genotype_separator = '|';
+                    }else{
+                        genotype_separator = '/';
+                    }
+                    genotype_columns = split(additionals[genotype_index], genotype_separator);
+                }
+                if (genotype_columns.size() != 2) {
+                    cout << "[VarMatch] Warning: Unrecognized Genotype: " << additionals[genotype_index] << endl;
+                    cout << "[VarMatch] \tAutomatically turn off genotype matching mode." << endl;
+                    match_mode_indicator = 1;
+                }
+                else {
+                    if (genotype_columns[0] != genotype_columns[1]) {
+                        is_heterozygous_variant = true;
+                    }
+                    if (genotype_columns[1] == "0" && genotype_columns[0] == "0") {
+                        // both alleles are the reference allele: not a variant
+                        continue;
+                    }
+                }
+            }
+        }
+        vector<string> alt_list;
+        if (alt_line.find(",") != std::string::npos) {
+            alt_list = split(alt_line, ',');
+            is_multi_alternatives = true;
+        }
+        else {
+            alt_list.push_back(alt_line);
+        }
+        // longest insertion / deletion among the alternate alleles
+        int snp_ins = max(0, (int)alt_list[0].length() - (int)ref.length());
+        int snp_del = max(0, (int)ref.length() - (int)alt_list[0].length());
+        if(is_multi_alternatives){
+            snp_ins = max(snp_ins, (int)alt_list[1].length() - (int)ref.length());
+            snp_del = max(snp_del, (int)ref.length() - (int)alt_list[1].length());
+        }
+        if(snp_ins > VAR_LEN || snp_del > VAR_LEN){
+            long_num ++;
+            continue;
+        }
+        DiploidVariant dv(pos, ref, alt_list, is_heterozygous_variant, is_multi_alternatives, snp_del, snp_ins, flag, quality);
+        if(chrid_by_chrname.find(chr_name) == chrid_by_chrname.end()){
+            cout << "[VarMatch] skip current variant as no corresponding reference genome sequence found." << endl;
+            continue;
+        }
+        int chr_id = chrid_by_chrname[chr_name];
+        if(flag == false){
+            ref_variant_by_chrid[chr_id]->push_back(dv);
+        }else{
+            que_variant_by_chrid[chr_id]->push_back(dv);
+            query_variant_strings.push_back(line);
+        }
+        total_num++;
+    }
+    vcf_file.close();
+    if(flag){
+        sort(quality_list.begin(), quality_list.end());
+        vector<float> temp_percentage_list;
+        temp_percentage_list.push_back(0.0);
+        threshold_list.push_back(0.0);
+        if(quality_list.empty()){
+            // no query qualities at all: keep the list lengths consistent
+            // without indexing into the empty quality list
+            for(size_t i = 1; i < per_list.size(); i++){
+                threshold_list.push_back(0.0);
+                temp_percentage_list.push_back(0.0);
+            }
+        }else{
+            auto qual_lower_it = lower_bound(quality_list.begin(), quality_list.end(), QUAL_LOWER_BOUND);
+            int qual_lower_index = qual_lower_it - quality_list.begin();
+            int rest_size = quality_list.size() - qual_lower_index;
+            const int quality_count = (int)quality_list.size();
+            for(size_t i = 1; i < per_list.size(); i++){
+                int additional_index = (int)(rest_size * per_list[i]);
+                int real_index = qual_lower_index + additional_index;
+                if(real_index >= quality_count) real_index = quality_count - 1;
+                double quality = quality_list[real_index];
+                threshold_list.push_back(quality);
+                auto quality_lowit = lower_bound(quality_list.begin(), quality_list.end(), quality);
+                int quality_low_index = quality_lowit - quality_list.begin();
+                // later stages retain variants >= quality threshold
+                int quality_size = quality_low_index + 1; // counting number, +/- 1 does not matter
+                if(quality_size > quality_count) quality_size = quality_count;
+                double percentage = (double)quality_size/ quality_list.size();
+                temp_percentage_list.push_back(percentage);
+            }
+        }
+        threshold_num = threshold_list.size();
+        // replace the requested fractions by the fractions actually obtained
+        per_list = temp_percentage_list;
+    }
+    cout << flag << "," << total_num << "," << long_num << endl;
+    return total_num;
+bool WholeGenome::ReadVariantFileList(string filename){
+// Edit distance between the reference allele of dv and its alternate
+// allele number allele_indicator.
+int WholeGenome::ScoreEditDistance(DiploidVariant & dv, int allele_indicator){
+    const auto & alt_allele = dv.alts[allele_indicator];
+    return EditDistance(dv.ref, alt_allele);
+inline int WholeGenome::EditDistance(const std::string& s1, const std::string& s2)
+	const std::size_t len1 = s1.size(), len2 = s2.size();
+	std::vector<unsigned int> col(len2+1), prevCol(len2+1);
+	for (unsigned int i = 0; i < prevCol.size(); i++)
+		prevCol[i] = i;
+	for (unsigned int i = 0; i < len1; i++) {
+		col[0] = i+1;
+		for (unsigned int j = 0; j < len2; j++)
+                        // note that std::min({arg1, arg2, arg3}) works only in C++11,
+                        // for C++98 use std::min(std::min(arg1, arg2), arg3)
+			col[j+1] = std::min({ prevCol[1 + j] + 1, col[j] + 1, prevCol[j] + (s1[i]==s2[j] ? 0 : 1) });
+		col.swap(prevCol);
+	}
+	return prevCol[len2];
+// Needleman Wunsch Initialization
+inline void WholeGenome::initialize_score_matrix(int **score, char **trackBack, int M, int N)
+    for (int i = 0; i < M+1; i++)
+    {
+        score[0][i] = i * -1;
+        trackBack[0][i] = '-';
+    }
+    for (int i = 0; i < N+1; i++)
+    {
+        score[i][0] = i * -1;
+        trackBack[i][0] = '|';
+    }
+    trackBack[0][0] = '*';
+// Global alignment of S1 and S2 (match 0, mismatch -1, gap -1).
+//   S1, S2 - sequences to align
+//   R1, R2 - outputs: the aligned sequences, equal length, '-' for gaps
+// Returns the alignment score (0 or negative).
+// The matrices have N+1 rows (one per prefix of S2) and M+1 columns (one
+// per prefix of S1). A diagonal step is taken only when it is strictly
+// better than both gap moves; ties prefer the '-' move over the '|' move.
+int WholeGenome::needleman_wunsch(string S1, string S2, string &R1, string &R2)
+    int M = S1.length();
+    int N = S2.length();
+    int **score = new int *[N+1];
+    char **trackBack = new char *[N+1];
+    for (int i = 0; i <= N; i++)
+    {
+        score[i] = new int [M+1];
+        // '*' diagonal (match/mismatch), '-' move right, '|' move down
+        trackBack[i] = new char [M+1];
+    }
+    R1 = "";
+    R2 = "";
+    initialize_score_matrix(score, trackBack, M, N);
+    for (int i = 1; i <=N; i++)
+    {
+        for (int k = 1; k <= M; k++)
+        {
+            char S1_k = S1[k-1];
+            char S2_i = S2[i-1];
+            int matchingCost = score[i-1][k-1];
+            if(S1_k != S2_i) matchingCost--;
+            int rightCost = score[i][k-1] - 1;
+            int downCost = score[i-1][k] - 1;
+            if (matchingCost > rightCost && matchingCost > downCost)
+            {
+                score[i][k] = matchingCost;
+                trackBack[i][k] = '*';
+            }else if(rightCost >= downCost)
+            {
+                score[i][k] = rightCost;
+                trackBack[i][k] = '-';
+            }else
+            {
+                score[i][k] = downCost;
+                trackBack[i][k] = '|';
+            }
+        }
+    }
+    // walk back from the bottom-right corner, emitting the alignment reversed
+    int n = N;
+    int m = M;
+    while(n > 0 || m > 0)
+    {
+        if (trackBack[n][m] == '*')
+        {
+            R1 += S1[m-1];
+            R2 += S2[n-1];
+            n--;
+            m--;
+        }else if(trackBack[n][m] == '-')
+        {
+            R1 += S1[m-1];
+            R2 += '-';
+            m--;
+        }else if(trackBack[n][m] == '|')
+        {
+            R1 += '-';
+            R2 += S2[n-1];
+            n--;
+        }
+    }
+    reverse(R1.begin(), R1.end());
+    reverse(R2.begin(), R2.end());
+    // read the result before releasing the matrices; they were previously
+    // leaked on every call
+    const int final_score = score[N][M];
+    for (int i = 0; i <= N; i++)
+    {
+        delete[] score[i];
+        delete[] trackBack[i];
+    }
+    delete[] score;
+    delete[] trackBack;
+    return final_score;
+// Distribute the bases of `alt` over the positions of `ref`.
+//   ref, alt   - reference and alternate allele
+//   alt_vector - output: one string per reference base is appended; entry
+//                k receives the alternate bases aligned to reference base k
+//                (indexing starts at 0, so the vector is expected to be
+//                empty on entry)
+// Does nothing when ref is empty. The alleles are aligned with
+// needleman_wunsch; inserted bases are attached to the reference base to
+// their left, or to the first base when the insertion precedes all of
+// them. A deleted reference base keeps an empty string.
+void WholeGenome::GenerateAltVector(string ref, string alt, vector<string> & alt_vector){
+    if(ref.size() == 0) return;
+    string ref_match = "";
+    string alt_match = "";
+    needleman_wunsch(ref, alt, ref_match, alt_match);
+    for(size_t i = 0; i < ref.size(); i++){
+        alt_vector.push_back("");
+    }
+    const int last_ref_index = (int)ref.size() - 1;
+    int current_ref_index = -1; // index of the last reference base consumed
+    for(size_t i = 0; i < ref_match.size(); i++){
+        if(alt_match[i] == '-' && ref_match[i] != '-'){
+            // deletion: the reference base is consumed, nothing is attached
+            current_ref_index ++;
+            continue;
+        }
+        if(ref_match[i] != '-'){
+            // match or mismatch consumes one reference base
+            current_ref_index ++;
+        }
+        // clamp into [0, last_ref_index]; the clamp used to fall through to
+        // a second, out-of-range write
+        int target = current_ref_index;
+        if(target < 0) target = 0;
+        if(target > last_ref_index) target = last_ref_index;
+        alt_vector[target].push_back(alt_match[i]);
+    }
+    return;
+// Cluster the variants of every chromosome, running several chromosomes
+// concurrently, and append all clusters to variants_by_cluster.
+// Always returns true.
+// Work proceeds in rounds: each round starts up to thread_num-1 worker
+// threads (one chromosome each) and handles one more chromosome on the
+// calling thread, then joins the workers. Worker threads are only started
+// for chromosomes that have both baseline and query variants.
+// NOTE(review): the chromosome run on the calling thread is not checked
+// for empty variant lists; presumably SingleThreadClustering copes with
+// that — confirm.
+bool WholeGenome::ParallelClustering(){
+    variant_cluster_by_chrid = new vector<vector<VariantIndicator>> *[chrom_num];
+    for (int j = 0; j < chrom_num; j++) {
+        variant_cluster_by_chrid[j] = new vector<vector<VariantIndicator>>;
+    }
+    // guard against a zero or negative thread count (division by zero below)
+    const int workers = thread_num > 0 ? thread_num : 1;
+    const int parallel_steps = (chrom_num + workers - 1) / workers; // ceiling
+    int chr_id = 0;
+    for(int i = 0; i < parallel_steps; i++){
+        vector<thread> threads;
+        // chrom_num-1: the final chromosome is always left to the calling thread
+        for(int j = 0; j < workers-1 && chr_id < chrom_num-1; j++){
+            if(chrname_by_chrid.find(chr_id) != chrname_by_chrid.end()){
+                if(ref_variant_by_chrid[chr_id]->size() > 0 && que_variant_by_chrid[chr_id]->size() > 0){
+                    threads.push_back(thread(&WholeGenome::SingleThreadClustering, this, chr_id));
+                }
+            }
+            chr_id ++;
+        }
+        if(chr_id < chrom_num){
+            if(chrname_by_chrid.find(chr_id) != chrname_by_chrid.end()){
+                SingleThreadClustering(chr_id);
+            }
+            chr_id ++;
+        }
+        for(auto & worker : threads){
+            worker.join();
+        }
+    }
+    // flatten the per-chromosome clusters into one list, in chromosome order
+    for(int i = 0; i < chrom_num; i++){
+        if(variant_cluster_by_chrid[i]->size() > 0){
+            variants_by_cluster.insert(variants_by_cluster.end(), variant_cluster_by_chrid[i]->begin(), variant_cluster_by_chrid[i]->end());
+        }
+    }
+    // the per-chromosome buffers are only needed during this call
+    for(int j = 0; j < chrom_num; j++){
+        variant_cluster_by_chrid[j]->clear();
+        delete variant_cluster_by_chrid[j];
+    }
+    delete[] variant_cluster_by_chrid;
+    return true;
+bool WholeGenome::ParallelMatching(){
+bool WholeGenome::TBBMatching()
+// Report whether `sequence` (compared case-insensitively) starts with at
+// least two back-to-back copies of some unit and consists of whole copies
+// of that unit; a trailing piece shorter than the unit is ignored.
+// A one-character sequence is reported as a repeat.
+// unit_threshold does not influence the result.
+bool WholeGenome::CheckTandemRepeat(string sequence, int unit_threshold) {
+    const int total_len = (int)sequence.length();
+    if(total_len == 1) return true;
+    transform(sequence.begin(), sequence.end(), sequence.begin(), ::toupper);
+    const int longest_unit = total_len / 2 + 1;
+    for (int unit_len = 1; unit_len <= longest_unit; unit_len++) {
+        int copies = 1; // the leading unit itself
+        bool every_copy_matches = true;
+        // compare each following full-length chunk with the leading unit
+        for (int offset = unit_len; offset + unit_len <= total_len; offset += unit_len) {
+            if (sequence.compare(offset, unit_len, sequence, 0, unit_len) != 0) {
+                every_copy_matches = false;
+                break;
+            }
+            copies ++;
+        }
+        if (every_copy_matches && copies > 1) {
+            return true;
+        }
+    }
+    return false;
+// Match the ref-set and query-set variants of one cluster and record the
+// outcome for every configured mode.
+//
+//   thread_index    - worker index; selects this thread's result containers
+//   threshold_index - index of the quality threshold variant_list was filtered with
+//   chr_id          - chromosome index into genome_sequences / chrname_by_chrid
+//   variant_list    - variants of the cluster (both sets); sorted in place
+//   cluster_id      - cluster identifier, forwarded to MatchingSingleClusterBaseExtending
+//
+// Returns false when either set contributes no variant, or when one set has a
+// single variant whose max(mdl, mil) exceeds max(sum of mdl, sum of mil) of a
+// counterpart set holding more than four variants. Returns true otherwise,
+// regardless of what MatchingSingleClusterBaseExtending reports.
+bool WholeGenome::MatchVariantListInThread(int thread_index, 
+    int threshold_index,
+    int chr_id,
+    vector<DiploidVariant> & variant_list,
+    int cluster_id){
+    sort(variant_list.begin(), variant_list.end());
+
+    // Split the variants by origin (flag: 0 = ref set, 1 = query set) and
+    // track the genome window they cover.
+    vector<DiploidVariant> separate_var_list[2];
+    vector<Interval> intervals;
+    int total_mil = 0;
+    int total_mdl = 0;
+    int min_pos = genome_sequences[chr_id].length() + 1;
+    int max_pos = -1;
+    for (size_t i = 0; i < variant_list.size(); i++) {
+        const DiploidVariant & cur_var = variant_list[i];
+        int flag = cur_var.flag ? 1 : 0;
+        int pos = cur_var.pos;
+        int ref_length = (int)cur_var.ref.length();
+        separate_var_list[flag].push_back(cur_var);
+        total_mil += cur_var.mil;
+        total_mdl += cur_var.mdl;
+        min_pos = min(pos, min_pos);
+        max_pos = max(pos + ref_length, max_pos);
+        // interval end is inclusive
+        intervals.push_back(Interval(pos, pos + ref_length - 1));
+    }
+    // pad the window by one base on each side, clamped to the chromosome
+    min_pos = max(min_pos - 1, 0);
+    max_pos = min(max_pos + 1, (int)genome_sequences[chr_id].length()); //exclusive
+
+    if (separate_var_list[0].size() == 0 || separate_var_list[1].size() == 0) {
+        return false;
+    }
+
+    if (separate_var_list[0].size() == 1 && separate_var_list[1].size() == 1){
+        // try direct match to save time
+        if(separate_var_list[0][0] == separate_var_list[1][0]){
+            const DiploidVariant & tv = separate_var_list[0][0];
+            string match_record = chrname_by_chrid[chr_id] + "\t" + to_string(tv.pos+1) + "\t" + tv.ref + "\t" + tv.alts[0];
+            if(tv.multi_alts) match_record += "/" + tv.alts[1];
+            match_record += "\t.\t.\t.\t.\t.\n";
+            // the same record is stored for every mode index
+            for(size_t mi = 0; mi < mode_index_list.size(); mi ++){
+                int mode_i = mode_index_list[mi];
+                match_records_by_mode_by_thread[thread_index][mode_i]->push_back(match_record);
+                baseline_total_match_num[thread_index][threshold_index]->at(mode_i)++;
+                query_total_match_num[thread_index][threshold_index]->at(mode_i)++;
+            }
+            return true;
+        }
+        // if not match, still can match by changing genome
+    }else if(separate_var_list[0].size() == 1 || separate_var_list[1].size() == 1){
+        // flag = the set holding exactly one variant, r_flag = the other set
+        int flag = (separate_var_list[1].size() == 1) ? 1 : 0;
+        int r_flag = 1-flag;
+        if(separate_var_list[r_flag].size() > 4){
+            int total_r_mdl = 0;
+            int total_r_mil = 0;
+            for(size_t k = 0; k < separate_var_list[r_flag].size(); k++){
+                total_r_mdl += separate_var_list[r_flag][k].mdl;
+                total_r_mil += separate_var_list[r_flag][k].mil;
+            }
+            if(max(separate_var_list[flag][0].mdl, separate_var_list[flag][0].mil) > max(total_r_mdl, total_r_mil)) return false;
+        }
+    }
+
+    // remove singular variant: one whose own change is larger than the change
+    // of all other variants together gets no decision point
+    // [todo] try removing this filter to see running time changes
+    vector<bool> appliable_flag;
+    int total_change = total_mil+total_mdl;
+    for(size_t k = 0; k < variant_list.size(); k++){
+        int max_change = max(variant_list[k].mil, variant_list[k].mdl);
+        appliable_flag.push_back(max_change <= total_change-max_change);
+    }
+
+    string subsequence = genome_sequences[chr_id].substr(min_pos, max_pos - min_pos);
+    ToUpper(subsequence); // subsequence only contains upper char
+    int offset = min_pos;
+
+    // Decision points per set: position inside subsequence -> variant index.
+    // The maps live on the stack so they are released on every exit path; the
+    // pointer array only exists because the callee takes multimap<int,int>*[].
+    multimap<int, int> choice_maps[2];
+    multimap<int, int> * choices_by_pos[2] = { &choice_maps[0], &choice_maps[1] };
+    for(size_t index = 0; index < variant_list.size(); index++){
+        if(!appliable_flag[index]) continue;
+        int pos = variant_list[index].pos - offset;
+        int flag = variant_list[index].flag ? 1 : 0;
+        choices_by_pos[flag]->insert(pair<int, int>(pos, (int)index));
+    }
+
+    // Sync points: the (inclusive) end of every merged variant interval, plus
+    // the last base of the subsequence when it is not already covered.
+    vector<Interval> mergered_intervals = merge(intervals);
+    vector<int> sync_points;
+    for(size_t i = 0; i < mergered_intervals.size(); i++){
+        sync_points.push_back(mergered_intervals[i].end-offset);
+    }
+    // signed comparison: size()-1 must not wrap around for an empty subsequence
+    int last_index = (int)subsequence.size() - 1;
+    if(sync_points.empty() || sync_points.back() < last_index){
+        sync_points.push_back(last_index);
+    }
+
+    // run the matcher once per (score unit, match mode, score scheme) combination
+    for(size_t i = 0; i < score_unit_list.size(); i++){
+        int score_unit = score_unit_list[i];
+        for(size_t j = 0; j < match_mode_list.size(); j++){
+            int match_mode = match_mode_list[j];
+            for(size_t k = 0; k < score_scheme_list.size(); k++){
+                int score_scheme = score_scheme_list[k];
+                MatchingSingleClusterBaseExtending(
+                                            cluster_id,
+                                            thread_index,
+                                            variant_list,
+                                            subsequence,
+                                            offset,
+                                            choices_by_pos,
+                                            sync_points,
+                                            chr_id,
+                                            score_unit,
+                                            match_mode,
+                                            score_scheme,
+                                            threshold_index);
+            }
+        }
+    }
+    return true;
+}
+// Run variant matching for every cluster id in [start, end) on one worker.
+//
+//   start, end   - half-open range of cluster ids; ids beyond
+//                  variants_by_cluster.size() end the loop early
+//   thread_index - worker index, forwarded to MatchVariantListInThread
+//
+// For each cluster with at least two variants and for each quality threshold,
+// the variants whose qual reaches the threshold are collected and handed to
+// MatchVariantListInThread. Always returns true.
+bool WholeGenome::ClusteringMatchInThread(int start, int end, int thread_index) {
+    for (int cluster_id = start; cluster_id < end; cluster_id++) {
+        if(cluster_id >= variants_by_cluster.size()) break;
+        // bind by reference: the cluster is only read here
+        const vector<VariantIndicator> & vi_list = variants_by_cluster[cluster_id];
+        if(vi_list.size() <= 1) continue;
+
+        for(int t = 0; t < threshold_num; t++){
+            double quality_threshold = threshold_list[t];
+            vector<DiploidVariant> variant_list;
+            int chr_id = -1;
+            bool chr_id_valid = true;
+            for(size_t i = 0; i < vi_list.size(); i++){
+                const VariantIndicator & vi = vi_list[i];
+                chr_id = vi.chr_id;
+                // validate before chr_id is used as an index below
+                if(chr_id < 0 || chr_id >= chrom_num){
+                    chr_id_valid = false;
+                    break;
+                }
+                const DiploidVariant & var = vi.refer
+                    ? ref_variant_by_chrid[chr_id]->at(vi.var_id)
+                    : que_variant_by_chrid[chr_id]->at(vi.var_id);
+                if(var.qual < quality_threshold) continue;
+                variant_list.push_back(var);
+            }
+            if(!chr_id_valid){
+                cout << "[VarMatch] Error in matching single cluster" << endl;
+                continue;
+            }
+            MatchVariantListInThread(thread_index, 
+                                    t,
+                                    chr_id,
+                                    variant_list,
+                                    cluster_id);
+        }
+    }
+    return true;
+}
+// SequencePath helpers are WholeGenome members that take the path as a
+// parameter; this keeps the memory footprint of each path small.
+//
+// Tell whether `sp` still has to make a choice at position `pos`: true when
+// either variant set has an entry at `pos` in choices_by_pos for which the
+// path has no entry in choice_made yet, false otherwise.
+bool WholeGenome::PathNeedDecision(SequencePath& sp, multimap<int, int> * choices_by_pos[], int pos){
+    for(int set_index = 0; set_index < 2; set_index++){
+        bool has_variant_here =
+            choices_by_pos[set_index]->find(pos) != choices_by_pos[set_index]->end();
+        if(!has_variant_here) continue;
+
+        bool already_decided =
+            sp.choice_made[set_index].find(pos) != sp.choice_made[set_index].end();
+        if(!already_decided) return true;
+    }
+    return false;
+}
+// Check that the donor sequences of the two variant sets are still compatible.
+// donor_sequences[s] is compared with donor_sequences[2+s]; with match_mode 0
+// both strands (s = 0, 1) are checked, with any other mode only strand 0.
+//
+// - All compared pairs have equal length: they must be identical. On success
+//   same_donor_len is set and current_equal_donor_pos[s] moves to the last
+//   index of each compared strand.
+// - Otherwise same_donor_len is cleared and, strand by strand, the characters
+//   after current_equal_donor_pos[s] up to the shorter length must agree;
+//   current_equal_donor_pos[s] then moves to the last shared index.
+//
+// Returns 0 when the path is still consistent, -1 when it must be dropped.
+int WholeGenome::CheckPathEqualProperty(SequencePath & sp, int match_mode)
+{
+    const int strand_count = (match_mode == 0) ? 2 : 1;
+
+    bool lengths_agree = true;
+    for(int s = 0; s < strand_count; s++){
+        if(sp.donor_sequences[s].length() != sp.donor_sequences[2+s].length()){
+            lengths_agree = false;
+            break;
+        }
+    }
+
+    if(lengths_agree){
+        // same length everywhere: only a full match keeps the path alive
+        for(int s = 0; s < strand_count; s++){
+            if(!(sp.donor_sequences[s] == sp.donor_sequences[2+s])) return -1;
+        }
+        sp.same_donor_len = true;
+        for(int s = 0; s < strand_count; s++){
+            sp.current_equal_donor_pos[s] = sp.donor_sequences[s].length()-1;
+        }
+        return 0;
+    }
+
+    // different lengths: verify the not-yet-compared part of the shared prefix
+    sp.same_donor_len = false;
+    for(int s = 0; s < strand_count; s++){
+        const string & first_donor = sp.donor_sequences[s];
+        const string & second_donor = sp.donor_sequences[2+s];
+        int shared_len = min(first_donor.length(), second_donor.length());
+        for(int k = sp.current_equal_donor_pos[s]+1; k < shared_len; k++){
+            if(first_donor[k] != second_donor[k]) return -1;
+        }
+        sp.current_equal_donor_pos[s] = shared_len-1;
+    }
+    return 0;
+}
+// Advance `sp` along the reference up to and including its next sync point.
+// One call covers one sync point, not one nucleotide.
+//
+// Return codes:
+//   -1  path must be deleted (no sync point left, or donor sequences disagree)
+//    0  sync point reached and CheckPathEqualProperty still holds
+//    1  stopped early: a decision is pending at sp.current_genome_pos + 1
+//    2  the last sync point was reached and the donor sequences match
+int WholeGenome::PathExtendOneStep(SequencePath& sp,
+                                   multimap<int, int> * choices_by_pos[],
+                                   const string & reference_sequence,
+                                   vector<int> & sync_points,
+                                   int match_mode){
+    if(sp.reached_sync_num >= sync_points.size()) return -1;
+
+    // the next sync point; this position is still part of the step
+    const int sync_pos = sync_points[sp.reached_sync_num];
+    // match_mode 1 extends only the even strands (0 and 2)
+    const int strand_step = (match_mode == 1) ? 2 : 1;
+
+    for(int genome_pos = sp.current_genome_pos + 1; genome_pos <= sync_pos; genome_pos++){
+        // a pending decision interrupts the step; the equal property is
+        // verified first so that hopeless paths are not branched
+        if(PathNeedDecision(sp, choices_by_pos, genome_pos)){
+            return (CheckPathEqualProperty(sp, match_mode) == -1) ? -1 : 1;
+        }
+        // "." means no edit was recorded here: copy the reference base
+        for(int strand = 0; strand < 4; strand += strand_step){
+            if(sp.string_sequences[strand][genome_pos] == "."){
+                sp.donor_sequences[strand] += reference_sequence[genome_pos];
+            }else{
+                sp.donor_sequences[strand] += sp.string_sequences[strand][genome_pos];
+            }
+        }
+        sp.current_genome_pos = genome_pos;
+    }
+
+    sp.reached_sync_num ++;
+    if(sp.reached_sync_num < sync_points.size()){
+        return CheckPathEqualProperty(sp, match_mode);
+    }
+    // the last sync point is the end of the reference subsequence
+    bool donors_match = sp.donor_sequences[0] == sp.donor_sequences[2] &&
+                        sp.donor_sequences[1] == sp.donor_sequences[3];
+    return donors_match ? 2 : -1;
+}
+// Score contributed by applying variant `dv` with the given allele choice.
+//
+//   choice       - allele choice recorded for the variant (-1, 0 or 1)
+//   score_unit   - 0: every variant counts 1; 1: edit-distance based score;
+//                  any other value leaves the raw score at 0
+//   match_mode   - 0: score the alleles implied by `choice` (allele 0 always,
+//                  allele 1 when choice is 0 on a multi-alt variant or when
+//                  choice is neither -1 nor 0); otherwise score allele `choice`
+//   score_scheme - 0: return the raw score; 1: only variants with dv.flag
+//                  unset score; 2: only variants with dv.flag set score
+//
+// Returns the score, or 0 when the scheme excludes the variant.
+int WholeGenome::CalculateScore(DiploidVariant & dv,
+                                int choice,
+                                int score_unit,
+                                int match_mode,
+                                int score_scheme){
+    int score = 0;
+    if(score_unit == 0){
+        score = 1;
+    }else if(score_unit == 1){
+        if(match_mode == 0){
+            score += ScoreEditDistance(dv, 0);
+            bool score_second_allele = (choice == 0) ? (bool)dv.multi_alts : (choice != -1);
+            if(score_second_allele){
+                score += ScoreEditDistance(dv, 1);
+            }
+        }else{
+            score += ScoreEditDistance(dv, choice);
+        }
+    }
+
+    if(score_scheme == 1){
+        return dv.flag ? 0 : score;
+    }
+    if(score_scheme == 2){
+        return dv.flag ? score : 0;
+    }
+    // Scheme 0 returns the raw score. Other values used to run off the end of
+    // the function (undefined behaviour); they now get the raw score as well.
+    // NOTE(review): confirm this is the intended default for unknown schemes.
+    return score;
+}
+// Branch path `sp` at position sp.current_genome_pos+1 without using genotype:
+// only one strand per variant set is maintained (string_sequences[0] and [2]).
+// Candidates per set: "apply nothing" plus every allele of a variant at this
+// position that does not conflict with edits already recorded on that strand.
+// One copy of sp per (set 0 candidate, set 1 candidate) pair is appended to sequence_path_list.
+bool WholeGenome::PathMakeDecisionNoGenotype(SequencePath& sp, // path to branch from; only read and copied here
+                                 vector<DiploidVariant> & variant_list, // indexed by the values stored in choices_by_pos
+                                 multimap<int, int> * choices_by_pos[], // per set: position -> variant index
+                                 list<SequencePath> & sequence_path_list, // output: the new paths are appended here
+                                 const string & reference_sequence, // not read by this function
+                                 int score_unit, // forwarded to CalculateScore
+                                 int match_mode, // forwarded to CalculateScore
+                                 int score_scheme) // forwarded to CalculateScore; the function always returns true
+    int pos = sp.current_genome_pos+1; // position the decision is made at
+    vector<pair<int, int>> candidate_choices[2]; // per set: (variant index, allele index)
+    for(int i = 0; i < 2; i++){
+        // (-1,-1) means "apply no variant of this set here"; the second loop skips the edit for it
+        candidate_choices[i].push_back(pair<int, int>(-1, -1));
+        // it is always offered, so each set has at least one candidate
+        // and it is offered whether or not a variant exists at this position
+        pair<multimap<int, int>::iterator, multimap<int, int>::iterator> var_range;
+        var_range = choices_by_pos[i]->equal_range(pos);
+        for(auto it = var_range.first; it != var_range.second; ++it){
+            int var_index = (*it).second;
+            DiploidVariant var = variant_list[var_index];
+            // check whether the current variant can still be applied
+            string ref = var.ref; // ref is compared against path entries starting at pos
+            string alts[2];
+            alts[0] = var.alts[0];
+            alts[1] = alts[0];
+            if(var.multi_alts){
+                alts[1] = var.alts[1];
+            }
+            // test the concrete allele choice, not just the variant as a whole
+            bool choice_applicable = true;
+            for(int k = 0; k < ref.length(); k++){
+            // for each ref char
+                int y = 0;
+                // only strand 0 of set i is inspected
+                if(sp.string_sequences[i*2+y][k+pos] != "."){
+                    // an earlier decision already edited this position
+                    if(k >= alts[y].length()){
+                        choice_applicable = false;
+                        break;
+                    }else{
+                        if(ref[k] != alts[y][k]){
+                            choice_applicable = false;
+                            break;
+                        }
+                    }
+                }
+            }
+            if(choice_applicable){
+                candidate_choices[i].push_back(pair<int, int>(var_index, 0));
+            }
+            if(var.multi_alts){
+                // a second alt allele exists: swap it into alts[0] and repeat the same check
+                string temp = alts[0];
+                alts[0] = alts[1];
+                alts[1] = temp;
+                choice_applicable = true;
+                for(int k = 0; k < ref.length(); k++){
+                // for each ref char
+                    //for(int y = 0; y < 2; y++)
+                    int y = 0;
+                    // only strand 0 of set i is inspected
+                    if(sp.string_sequences[i*2+y][k+pos] != "."){
+                        // an earlier decision already edited this position
+                        if(k >= alts[y].length()){
+                            // the allele is shorter than ref here, i.e. a deletion
+                            choice_applicable = false;
+                            break;
+                        }else{
+                            // the allele must equal ref at this position;
+                            // an insertion is fine as long as this position is unchanged
+                            if(ref[k] != alts[y][k]){
+                                choice_applicable = false;
+                                break;
+                            }
+                        }
+                    }
+                }
+                if(choice_applicable){
+                    candidate_choices[i].push_back(pair<int, int>(var_index, 1));
+                }
+            }
+        }
+    }
+    //dout << candidate_choices[0].size() << "," << candidate_choices[1].size() << endl;
+    for(int i = 0; i < candidate_choices[0].size(); i++){
+        for(int j = 0; j < candidate_choices[1].size(); j++){
+            // one new path per pair of candidates
+            SequencePath path = sp;
+            pair<int, int> var_choice[2];
+            var_choice[0] = candidate_choices[0][i];
+            var_choice[1] = candidate_choices[1][j];
+            for(int x = 0; x < 2; x++){
+                // x = 0: first variant set (truth), x = 1: second set (predict)
+                int var_index = var_choice[x].first;
+                if(var_index != -1){
+                    DiploidVariant var = variant_list[var_index];
+                    // if(var.flag != x){
+                    //     dout << "Error" << endl;
+                    // }
+                    string ref = var.ref;
+                    string alts[2];
+                    int c = var_choice[x].second; // chosen allele index
+                    alts[0] = var.alts[c];
+                    path.score += CalculateScore(var,
+                                                 c,
+                                                 score_unit,
+                                                 match_mode,
+                                                 score_scheme);
+                    ToUpper(ref);
+                    ToUpper(alts[0]);
+                    int y = 0;
+                    int k = 0;
+                    for(; k < ref.length()-1; k++){ // every ref position except the last
+                        if(k < alts[y].length()){
+                            if(ref[k] != alts[y][k]){
+                                path.string_sequences[x*2+y][pos+k] = alts[y].substr(k,1);
+                            }
+                            // equal to ref: leave the entry untouched
+                        }else{
+                            path.string_sequences[x*2+y][pos+k] = ""; // allele ended: this base is deleted
+                        }
+                    }
+                    // here k == ref.length()-1: the last ref position takes the rest of the allele
+                    if(k < alts[y].length()){
+                        string alt_part = alts[y].substr(k, alts[y].length()-k);
+                        if(alt_part.length() > 1){
+                            if(alt_part[0] == ref[k]){
+                                if(path.string_sequences[x*2+y][pos+k] == "."){
+                                    path.string_sequences[x*2+y][pos+k] = alt_part;
+                                }else{
+                                    path.string_sequences[x*2+y][pos+k] += alt_part.substr(1, alt_part.size() - 1); // keep the earlier edit, append only the inserted tail
+                                }
+                            }else{
+                                path.string_sequences[x*2+y][pos+k] = alt_part;
+                            }
+                        }else{
+                            if(ref[k] != alts[y][k]){
+                                path.string_sequences[x*2+y][pos+k] = alt_part;
+                            }
+                        }
+                    }else{
+                        path.string_sequences[x*2+y][pos+k] = "";
+                    }
+                }
+                path.choice_made[x][pos] = var_choice[x]; // recorded even for (-1,-1)
+            }
+            sequence_path_list.push_back(path);
+        }
+    }
+    //expected number of inserted paths are 2,3,4,6,x...
+    return true;
+bool WholeGenome::PathMakeDecision(SequencePath& sp, // path to branch at sp.current_genome_pos+1; only read and copied here
+                                 vector<DiploidVariant> & variant_list, // indexed by the values stored in choices_by_pos
+                                 multimap<int, int> * choices_by_pos[], // per set: position -> variant index
+                                 list<SequencePath> & sequence_path_list, // output: one copy of sp per pair of candidate choices is appended
+                                 const string & reference_sequence, // only referenced by commented-out code below
+                                 int score_unit, // forwarded to CalculateScore
+                                 int match_mode, // forwarded to CalculateScore
+                                 int score_scheme) // forwarded to CalculateScore; the function always returns true
+    int pos = sp.current_genome_pos+1; // position the decision is made at
+    vector<pair<int, int>> candidate_choices[2]; // per set: (variant index, choice code)
+    for(int i = 0; i < 2; i++){
+        // (-1,-1) means "apply no variant of this set here"; the second loop skips the edit for it
+        candidate_choices[i].push_back(pair<int, int>(-1, -1));
+        // it is offered whether or not a variant exists at this position
+        pair<multimap<int, int>::iterator, multimap<int, int>::iterator> var_range;
+        var_range = choices_by_pos[i]->equal_range(pos);
+        for(auto it = var_range.first; it != var_range.second; ++it){
+            int var_index = (*it).second;
+            DiploidVariant var = variant_list[var_index];
+            //PrintVariant(var);
+            // check whether the current variant can still be applied
+            string ref = var.ref; // ref is compared against path entries starting at pos
+            string alts[2]; // allele placed on strand 0 and strand 1 of set i
+            alts[0] = var.alts[0];
+            alts[1] = alts[0];
+            if(var.multi_alts){
+                alts[1] = var.alts[1];
+            }else if(var.heterozygous){
+                alts[1] = ref;
+            }
+            // test the concrete strand assignment, not just the variant as a whole
+            bool choice_applicable = true;
+            for(int k = 0; k < ref.length(); k++){
+            // for each ref char
+                for(int y = 0; y < 2; y++){
+                    // for each strand of set i
+                    if(sp.string_sequences[i*2+y][k+pos] != "."){
+                        // an earlier decision already edited this position
+                        if(k >= alts[y].length()){
+                            choice_applicable = false;
+                            break;
+                        }else{
+                            if(ref[k] != alts[y][k]){
+                                choice_applicable = false;
+                                break;
+                            }
+                        }
+                    }
+                }
+                if(!choice_applicable) break;
+            }
+            if(choice_applicable){
+                candidate_choices[i].push_back(pair<int, int>(var_index, 0));
+            }
+            if(var.heterozygous){
+                // heterozygous: the swapped strand assignment is a second choice; check it the same way
+                string temp = alts[0];
+                alts[0] = alts[1];
+                alts[1] = temp;
+                choice_applicable = true;
+                for(int k = 0; k < ref.length(); k++){
+                // for each ref char
+                    for(int y = 0; y < 2; y++){
+                        // for each strand of set i
+                        if(sp.string_sequences[i*2+y][k+pos] != "."){
+                            // an earlier decision already edited this position
+                            if(k >= alts[y].length()){
+                                // the allele is shorter than ref here, i.e. a deletion
+                                choice_applicable = false;
+                                break;
+                            }else{
+                                // the allele must equal ref at this position;
+                                // an insertion is fine as long as this position is unchanged
+                                if(ref[k] != alts[y][k]){
+                                    choice_applicable = false;
+                                    break;
+                                }
+                            }
+                        }
+                    }
+                    if(!choice_applicable) break;
+                }
+                if(choice_applicable){
+                    if(var.multi_alts){
+                        candidate_choices[i].push_back(pair<int, int>(var_index, 1)); // second alt allele on strand 0
+                    }else{
+                        candidate_choices[i].push_back(pair<int, int>(var_index, -1)); // ref on strand 0, alt on strand 1
+                    }
+                }
+            }
+        }
+    }
+    //dout << candidate_choices[0].size() << "," << candidate_choices[1].size() << endl;
+    for(int i = 0; i < candidate_choices[0].size(); i++){
+        for(int j = 0; j < candidate_choices[1].size(); j++){
+            // one new path per pair of candidates
+            SequencePath path = sp;
+            pair<int, int> var_choice[2];
+            var_choice[0] = candidate_choices[0][i];
+            var_choice[1] = candidate_choices[1][j];
+            for(int x = 0; x < 2; x++){
+                // x = 0: first variant set (truth), x = 1: second set (predict)
+                int var_index = var_choice[x].first;
+                if(var_index != -1){
+//                    string temp_sequence = reference_sequence.substr(pos, 1);
+//                    path.string_sequences[x*2][pos] = temp_sequence;
+//                    path.string_sequences[x*2+1][pos] = temp_sequence;
+//                }else{
+                    // a variant was chosen: score it and write its alleles into the path
+                    DiploidVariant var = variant_list[var_index];
+                    // if(var.flag != x){
+                    //     dout << "Error" << endl;
+                    // }
+                    string ref = var.ref;
+                    string alts[2]; // allele for strand 0 and strand 1 of set x
+                    int c = var_choice[x].second; // choice code: -1, 0 or 1
+                    if(c == -1){
+                        alts[0] = ref;
+                        alts[1] = var.alts[0];
+                    }else{
+                        // c == 0 or 1
+                        alts[0] = var.alts[c];
+                        alts[1] = alts[0];
+                        if(var.multi_alts){
+                            // the other alt allele goes on strand 1
+                            alts[1] = var.alts[1- c];
+                        }else{
+                            // c is 0 here; a heterozygous variant keeps ref on strand 1
+                            if(var.heterozygous) alts[1] = ref;
+                        }
+                    }
+                    path.score += CalculateScore(var,
+                                                 c,
+                                                 score_unit,
+                                                 match_mode,
+                                                 score_scheme);
+                    ToUpper(ref);
+                    ToUpper(alts[0]);
+                    ToUpper(alts[1]);
+                    for(int y = 0; y < 2; y++){
+                        // write one allele per strand
+                        string alt = alts[y];
+                        vector<string> alt_vector; // indexed by ref position below; presumably one alt piece per ref base - confirm in GenerateAltVector
+                        GenerateAltVector(ref, alt, alt_vector);
+                        int k = 0;
+                        for(; k < ref.length()-1; k++){ // every ref position except the last
+                            if(alt_vector[k].size() != 1 || ref[k] != alt_vector[k][0]){
+                                path.string_sequences[x*2+y][pos+k] = alt_vector[k];
+                            }
+                            // a single base equal to ref: leave the entry untouched
+                        }
+                        // here k == ref.length()-1, the last ref position
+                        assert(k == ref.length()-1);
+                        string alt_part = alt_vector[k];
+                        if(alt_part.length() > 0){
+                            if(alt_part.length() > 1){
+                                if(alt_part[0] == ref[k]){
+                                    if(path.string_sequences[x*2+y][pos+k] == "."){
+                                        path.string_sequences[x*2+y][pos+k] = alt_part;
+                                    }else{
+                                        path.string_sequences[x*2+y][pos+k] += alt_part.substr(1, alt_part.size() - 1); // keep the earlier edit, append only the inserted tail
+                                    }
+                                }else{
+                                    path.string_sequences[x*2+y][pos+k] = alt_part;
+                                }
+                            }else{
+                                if(ref[k] != alt_vector[k][0]){
+                                    path.string_sequences[x*2+y][pos+k] = alt_part;
+                                }
+                            }
+                        }else{
+                            path.string_sequences[x*2+y][pos+k] = ""; // empty piece: this base is deleted
+                        }
+                    }
+                }
+                path.choice_made[x][pos] = var_choice[x]; // recorded even for (-1,-1)
+            }
+            // both sets have made their choice at pos
+            //dout << "after decision at pos " << pos << endl;
+            //PrintPath(path);
+            sequence_path_list.push_back(path);
+        }
+    }
+    //expected number of inserted paths are 2,3,4,6,x...
+    return true;
+bool WholeGenome::PathMakeDecisionBackup(SequencePath& sp, // in: path to branch at position sp.current_genome_pos+1; it is copied, not modified
+                                 vector<DiploidVariant> & variant_list, // variants, indexed by the values stored in choices_by_pos
+                                 multimap<int, int> * choices_by_pos[], // [0] and [1]: per call set, position -> variant index
+                                 list<SequencePath> & sequence_path_list, // out: one branched copy of sp is appended per combination of choices
+                                 const string & reference_sequence, // only referenced by the commented-out code below
+                                 int score_unit, // forwarded to CalculateScore
+                                 int match_mode, // forwarded to CalculateScore
+                                 int score_scheme) // forwarded to CalculateScore; the function always returns true
+    int pos = sp.current_genome_pos+1; // position at which the decision is made
+    vector<pair<int, int>> candidate_choices[2]; // per call set: (variant index, phasing code) pairs
+    for(int i = 0; i < 2; i++){
+        // (-1,-1) is skipped by the branching loop below, so it is always a valid candidate
+        candidate_choices[i].push_back(pair<int, int>(-1, -1));
+        // i.e. at this position the path may choose to use no variant, whether or not one exists here
+        pair<multimap<int, int>::iterator, multimap<int, int>::iterator> var_range;
+        var_range = choices_by_pos[i]->equal_range(pos);
+        for(auto it = var_range.first; it != var_range.second; ++it){
+            int var_index = (*it).second;
+            DiploidVariant var = variant_list[var_index];
+            //PrintVariant(var);
+            // check whether this variant conflicts with decisions already made on the path
+            string ref = var.ref; // even without knowing the offset, ref is taken to start at pos
+            string alts[2];
+            alts[0] = var.alts[0];
+            alts[1] = alts[0]; // default: both strands carry the first alt allele
+            if(var.multi_alts){
+                alts[1] = var.alts[1]; // two distinct alt alleles
+            }else if(var.heterozygous){
+                alts[1] = ref; // heterozygous with a single alt: second strand keeps ref
+            }
+            // test whether this particular choice (allele-to-strand assignment) is applicable, not merely the variant
+            bool choice_applicable = true;
+            for(int k = 0; k < ref.length(); k++){
+            // for each ref char
+                for(int y = 0; y < 2; y++){
+                    // for each strain
+                    if(sp.string_sequences[i*2+y][k+pos] != "."){
+                        // a non-"." entry means a decision has already been made at this position
+                        if(k >= alts[y].length()){ // the allele has no character at k
+                            choice_applicable = false;
+                            break; // leaves only the strand loop; the ref-char loop is left by the check below
+                        }else{
+                            if(ref[k] != alts[y][k]){ // the allele would change this base
+                                choice_applicable = false;
+                                break;
+                            }
+                        }
+                    }
+                }
+                if(!choice_applicable) break;
+            }
+            if(choice_applicable){
+                candidate_choices[i].push_back(pair<int, int>(var_index, 0)); // code 0: alleles in their original order
+            }
+            if(var.heterozygous){
+                // if heterozygous, the swapped allele order is a second choice; check whether it is applicable
+                string temp = alts[0];
+                alts[0] = alts[1];
+                alts[1] = temp;
+                choice_applicable = true;
+                for(int k = 0; k < ref.length(); k++){
+                // for each ref char
+                    for(int y = 0; y < 2; y++){
+                        // for each strain
+                        if(sp.string_sequences[i*2+y][k+pos] != "."){
+                            // a non-"." entry means a decision has already been made at this position
+                            if(k >= alts[y].length()){
+                                // should be a deletion
+                                choice_applicable = false;
+                                break;
+                            }else{
+                                // should be equal at current position
+                                // can be an insertion, as long as current position is the same
+                                if(ref[k] != alts[y][k]){
+                                    choice_applicable = false;
+                                    break;
+                                }
+                            }
+                        }
+                    }
+                    if(!choice_applicable) break;
+                }
+                if(choice_applicable){
+                    if(var.multi_alts){
+                        candidate_choices[i].push_back(pair<int, int>(var_index, 1)); // code 1: the two alt alleles swapped
+                    }else{
+                        candidate_choices[i].push_back(pair<int, int>(var_index, -1)); // code -1: ref on the first strand, alt on the second
+                    }
+                }
+            }
+        }
+    }
+    //dout << candidate_choices[0].size() << "," << candidate_choices[1].size() << endl;
+    for(int i = 0; i < candidate_choices[0].size(); i++){
+        for(int j = 0; j < candidate_choices[1].size(); j++){
+            // iterate over every combination of a choice for set 0 and a choice for set 1
+            SequencePath path = sp; // each combination branches from a fresh copy of the input path
+            pair<int, int> var_choice[2];
+            var_choice[0] = candidate_choices[0][i];
+            var_choice[1] = candidate_choices[1][j];
+            for(int x = 0; x < 2; x++){
+                // iterate truth and predict
+                int var_index = var_choice[x].first;
+                if(var_index != -1){
+//                    string temp_sequence = reference_sequence.substr(pos, 1);
+//                    path.string_sequences[x*2][pos] = temp_sequence;
+//                    path.string_sequences[x*2+1][pos] = temp_sequence;
+//                }else{
+                    // set score
+                    DiploidVariant var = variant_list[var_index];
+                    // if(var.flag != x){
+                    //     dout << "Error" << endl;
+                    // }
+                    string ref = var.ref;
+                    string alts[2];
+                    int c = var_choice[x].second; // phasing code chosen above: 0, 1 or -1
+                    if(c == -1){
+                        alts[0] = ref;
+                        alts[1] = var.alts[0];
+                    }else{
+                        // c == 0 or 1
+                        alts[0] = var.alts[c];
+                        alts[1] = alts[0];
+                        if(var.multi_alts){
+                            // choose 1 or 0
+                            alts[1] = var.alts[1- c];
+                        }else{
+                            // c is 0, choose 0 or -1
+                            if(var.heterozygous) alts[1] = ref;
+                        }
+                    }
+                    path.score += CalculateScore(var,
+                                                 c,
+                                                 score_unit,
+                                                 match_mode,
+                                                 score_scheme);
+                    ToUpper(ref);
+                    ToUpper(alts[0]);
+                    ToUpper(alts[1]);
+                    for(int y = 0; y < 2; y++){
+                        // iterate over the two alleles (one per strand)
+                        int k = 0;
+                        for(; k < ref.length()-1; k++){ // NOTE(review): ref.length() is unsigned, so an empty ref would wrap around here - presumably ref is never empty; confirm
+                            if(k < alts[y].length()){
+                                if(ref[k] != alts[y][k]){
+                                    path.string_sequences[x*2+y][pos+k] = alts[y].substr(k,1); // single-base substitution
+                                }
+                                // else change nothing
+                            }else{
+                                path.string_sequences[x*2+y][pos+k] = ""; // allele has ended: this base is removed
+                            }
+                        }
+                        // hence k == ref.length()-1, the last position
+                        if(k < alts[y].length()){
+                            string alt_part = alts[y].substr(k, alts[y].length()-k); // rest of the allele from k onward
+                            if(alt_part.length() > 1){ // more allele left than ref: extra bases go into the last ref slot
+                                if(alt_part[0] == ref[k]){
+                                    if(path.string_sequences[x*2+y][pos+k] == "."){
+                                        path.string_sequences[x*2+y][pos+k] = alt_part;
+                                    }else{
+                                        path.string_sequences[x*2+y][pos+k] += alt_part.substr(1, alt_part.size() - 1); // keep what is stored and append only the tail
+                                    }
+                                }else{
+                                    path.string_sequences[x*2+y][pos+k] = alt_part;
+                                }
+                            }else{
+                                if(ref[k] != alts[y][k]){
+                                    path.string_sequences[x*2+y][pos+k] = alt_part;
+                                }
+                            }
+                        }else{
+                            path.string_sequences[x*2+y][pos+k] = "";
+                        }
+                    }
+                }
+                path.choice_made[x][pos] = var_choice[x]; // recorded even for the (-1,-1) "no variant" choice
+            }
+            // choice made
+            //dout << "after decision at pos " << pos << endl;
+            //PrintPath(path);
+            sequence_path_list.push_back(path);
+        }
+    }
+    // the expected number of inserted paths is 2,3,4,6,x...
+    return true;
+void WholeGenome::PrintPath(SequencePath & sp){ // debug helper: dumps the contents of sp to stdout
+    cout << "- Sequence Path:" << endl;
+    cout << "@ String Sequences:" << endl;
+    for(int i = 0; i < 4; i++){ // four string sequences, one output line each
+        for(int j = 0; j < sp.string_sequences[i].size(); j++){
+            cout << sp.string_sequences[i][j] << " "; // entries are separated by a single space
+        }
+        cout << endl;
+    }
+    cout << "@ Donor Sequences:" << endl;
+    for(int i = 0; i < 4; i++){ // four donor sequences, one output line each
+        cout << sp.donor_sequences[i] << endl;
+    }
+    cout << "@ Removable: " << sp.removable << endl;
+// Search loop of the function below: it runs until the current path list is empty.
+// PathExtendOneStep result 0: the path is moved to the next path list (processed in the next round)
+// result 1: a decision is needed; the decision routine appends the branched paths to the current list
+// result 2: the path is compared with the best path so far; result -1: the path is dropped
+bool WholeGenome::MatchingSingleClusterBaseExtending(int cluster_index, // only referenced by a commented-out debug line
+                                                    int thread_index, // forwarded to the match-record builder
+                                                    vector<DiploidVariant> & variant_list, // forwarded to the decision routines and the record builder
+                                                    string & subsequence, // its length sizes the initial SequencePath
+                                                    int offset, // forwarded to the match-record builder
+                                                    multimap<int, int> * choices_by_pos[], // forwarded to PathExtendOneStep and the decision routines
+                                                    vector<int> & sync_points, // forwarded to PathExtendOneStep
+                                                    int chr_id, // forwarded to the match-record builder
+                                                    int score_unit,
+                                                    int match_mode, // 0 selects PathMakeDecision / ConstructMatchRecord, otherwise the NoGenotype variants
+                                                    int score_scheme,
+                                                    int threshold_index){ // forwarded to the match-record builder
+    //--------------for unit test------------------------------
+    //dout << variant_list.size() << endl;
+    //int chr_id = 0;
+    //-------------end unit test-------------------------------
+    // so a legal sync_points vector contains at least two
+    // first is the end of variant, there should be at least one variant
+    // second is the end of subsequence, there should be at least one nt not influenced by a variant
+    list<SequencePath> current_path_list;
+    list<SequencePath> next_path_list;
+    SequencePath sp(subsequence.length());
+    SequencePath best_path = sp; // replaced only by paths for which PathExtendOneStep returns 2
+    current_path_list.push_back(sp);
+    while(current_path_list.size() != 0){
+        bool reach_sync_point = true; // NOTE(review): never read in this function
+        while(current_path_list.size() != 0){
+            SequencePath path = current_path_list.front();
+            current_path_list.pop_front();
+            //dout << path.current_genome_pos << ":" << current_path_list.size() << endl;
+            //PrintPath(path);
+            int is_extend = PathExtendOneStep(path, choices_by_pos, subsequence, sync_points, match_mode);
+            //if(cluster_index == 220730) PrintPath(path);
+            if(is_extend == -1){
+                continue; // the path is dropped
+            }
+            else if(is_extend == 0){
+                next_path_list.push_back(path);
+                // here the path is supposed to reach the next sync point
+            }else if(is_extend == 1){
+                if(match_mode == 0){
+                    PathMakeDecision(path,
+                                     variant_list,
+                                     choices_by_pos,
+                                     current_path_list, // branched paths are processed in this same inner loop
+                                     subsequence,
+                                     score_unit,
+                                     match_mode,
+                                     score_scheme);
+                }else{
+                    PathMakeDecisionNoGenotype(path,
+                                               variant_list,
+                                               choices_by_pos,
+                                               current_path_list,
+                                               subsequence,
+                                               score_unit,
+                                               match_mode,
+                                               score_scheme);
+                }
+            }else if(is_extend == 2){
+                if(path.score > best_path.score){ // strict comparison: on a tie the earlier path is kept
+                    best_path = path; // only when you reach the very end can you be considered as best path
+                    //PrintPath(best_path);
+                }
+            }
+        }
+        current_path_list = next_path_list; // the next round works on the paths collected for result 0
+        next_path_list.clear();
+        if(current_path_list.size() > 0){
+            //int current_genome_pos = current_path_list.front().current_genome_pos;
+            // after revise, we do not need this check
+            //if(sync_points.find(current_genome_pos) != sync_points.end()){
+                //dout << "converge paths at position: " << current_genome_pos << endl;
+                //dout << "before converge: " << current_path_list.size() << endl;
+                ConvergePaths(current_path_list);
+                //dout << "after converge: " << current_path_list.size() << endl;
+            //}
+        }
+    }
+    // print best_path
+    if(best_path.score <= 0) return false; // no path with a positive score was accepted: nothing is recorded
+    //dout << "new method: " << best_path.score << endl;
+    //==========================output ======================
+    int mode_index = GetIndexFromMatchScore(score_unit, match_mode, score_scheme);
+    if(match_mode == 0){
+        ConstructMatchRecord(best_path,
+                             variant_list,
+                             subsequence,
+                             offset,
+                             thread_index,
+                             chr_id,
+                             mode_index,
+                             threshold_index);
+    }else{
+        ConstructMatchRecordNoGenotype(best_path,
+                                       variant_list,
+                                       subsequence,
+                                       offset,
+                                       thread_index,
+                                       chr_id,
+                                       mode_index,
+                                       threshold_index);
+    }
+    return true;
+void WholeGenome::ConstructMatchRecord(SequencePath & best_path, // path whose choice_made, donor_sequences and score are reported
+                                       vector<DiploidVariant> & variant_list, // indexed by the variant indices stored in best_path.choice_made
+                                       string & subsequence, // written to the record as the REF column
+                                       int offset, // written to the record as offset+1 (POS column)
+                                       int thread_index, // selects the per-thread record list and counters
+                                       int chr_id, // looked up in chrname_by_chrid for the CHROM column
+                                       int mode_index, // selects the per-mode record list and counter slot
+                                       int threshold_index){ // selects the counter set; a text record is built only when it is 0
+    int truth_num = 0; // chosen variants with flag == false
+    int predict_num = 0; // chosen variants with flag == true
+    bool need_match_record = false;
+    if (threshold_index == 0) need_match_record = true;
+    bool multiple_match = false;
+    if(best_path.donor_sequences[0] != best_path.donor_sequences[1]) multiple_match = true; // the two donor sequences differ: both are printed
+    string parsimonious_ref = subsequence;
+    string parsimonious_alt0 = best_path.donor_sequences[0];
+    string parsimonious_alt1 = best_path.donor_sequences[1];
+    int parsimonious_pos = offset;
+//    NormalizeVariantSequence(offset,
+//                             parsimonious_ref,
+//                             parsimonious_alt0,
+//                             parsimonious_alt1,
+//                             chr_id);
+    string match_record = chrname_by_chrid[chr_id] + "\t" + to_string(parsimonious_pos+1) + "\t" + parsimonious_ref + "\t" + parsimonious_alt0;
+    if(multiple_match) match_record += "/" + parsimonious_alt1;
+    string vcf_record[2]; // per call set: ';'-separated "pos,ref,alt" entries
+    string phasing_record[2]; // per call set: ';'-separated phasing strings, parallel to vcf_record
+	for (int i = 0; i < 2; i++) {
+		for (auto it = best_path.choice_made[i].begin(); it != best_path.choice_made[i].end(); ++it) {
+            pair<int, int> selection = it->second; // (variant index, phasing code)
+            int phasing = selection.second;
+            if(selection.first == -1) continue; // "no variant" choice: nothing to report
+            if (phasing == -1) phasing = 1; // code -1 is reported like code 1 below
+            DiploidVariant variant = variant_list[selection.first];
+            if(!variant.flag){
+                truth_num++;
+            }else{
+                predict_num++;
+            }
+            if(need_match_record){
+                string alt_string = variant.alts[0];
+                if(variant.multi_alts){
+                    alt_string += "/" + variant.alts[1];
+                }
+                string phasing_string = "";
+                if(phasing == 0){ // first alt on the first strand
+                    phasing_string += "1";
+                    if(variant.heterozygous){
+                        if(variant.multi_alts){
+                            phasing_string += "|2";
+                        }else{
+                            phasing_string += "|0";
+                        }
+                    }else{
+                        phasing_string += "|1";
+                    }
+                }else if(phasing == 1){ // first alt on the second strand
+                    if(variant.multi_alts){
+                        phasing_string += "2|1";
+                    }else{
+                        phasing_string += "0|1";
+                    }
+                }
+                string variant_record = to_string(variant.pos+1) + "," + variant.ref + "," + alt_string;
+                vcf_record[i] += variant_record;
+                phasing_record[i] += phasing_string;
+                vcf_record[i] += ";";
+                phasing_record[i] += ";";
+            }
+		}
+        if(need_match_record){
+            vcf_record[i] = vcf_record[i].substr(0, vcf_record[i].size()-1); // drop the trailing ';' (an empty string stays empty)
+            phasing_record[i] = phasing_record[i].substr(0, phasing_record[i].size()-1);
+        }
+	}
+    if(need_match_record){
+    	match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+        match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+    	match_record += "\t" + to_string(best_path.score) + "\n";
+    	//complex_match_records[thread_index]->push_back(match_record);
+    	match_records_by_mode_by_thread[thread_index][mode_index]->push_back(match_record);
+    }
+    baseline_total_match_num[thread_index][threshold_index]->at(mode_index) += truth_num; // counters are updated for every threshold_index
+    query_total_match_num[thread_index][threshold_index]->at(mode_index) += predict_num;
+void WholeGenome::ConstructMatchRecordNoGenotype(SequencePath & best_path, // path whose choice_made, donor_sequences[0] and score are reported
+                                                 vector<DiploidVariant> & variant_list, // indexed by the variant indices stored in best_path.choice_made
+                                                 string & subsequence, // written to the record as the REF column
+                                                 int offset, // written to the record as offset+1 (POS column)
+                                                 int thread_index, // selects the per-thread record list and counters
+                                                 int chr_id, // looked up in chrname_by_chrid for the CHROM column
+                                                 int mode_index, // selects the per-mode record list and counter slot
+                                                 int threshold_index){ // selects the counter set; a text record is built only when it is 0
+    int truth_num = 0; // chosen variants with flag == false
+    int predict_num = 0; // chosen variants with flag == true
+    bool need_match_record = false;
+    if(threshold_index == 0) need_match_record = true;
+    bool multiple_match = false; // NOTE(review): unused here; its only use below is commented out
+    string parsimonious_ref = subsequence;
+    string parsimonious_alt0 = best_path.donor_sequences[0];
+    string parsimonious_alt1 = best_path.donor_sequences[0]; // NOTE(review): same source as alt0 and only used by commented-out code
+    int parsimonious_pos = offset;
+//    NormalizeVariantSequence(offset,
+//                             parsimonious_ref,
+//                             parsimonious_alt0,
+//                             parsimonious_alt1,
+//                             chr_id);
+    string match_record = chrname_by_chrid[chr_id] + "\t" + to_string(parsimonious_pos+1) + "\t" + parsimonious_ref + "\t" + parsimonious_alt0;
+    //if(multiple_match) match_record += "/" + parsimonious_alt1;
+    string vcf_record[2]; // per call set: ';'-separated "pos,ref,alt" entries
+    string phasing_record[2]; // per call set: ';'-separated phasing strings, parallel to vcf_record
+	for (int i = 0; i < 2; i++) {
+		for (auto it = best_path.choice_made[i].begin(); it != best_path.choice_made[i].end(); ++it) {
+            pair<int, int> selection = it->second; // (variant index, phasing code)
+            int phasing = selection.second;
+            if(selection.first == -1) continue; // "no variant" choice: nothing to report
+            if (phasing == -1) continue; // unlike ConstructMatchRecord, code -1 is skipped and not counted
+            DiploidVariant variant = variant_list[selection.first];
+            if(!variant.flag){
+                truth_num++;
+            }else{
+                predict_num++;
+            }
+            if(need_match_record){
+                string alt_string = variant.alts[0];
+                if(variant.multi_alts){
+                    alt_string += "/" + variant.alts[1];
+                }
+                string phasing_string = "";
+                if(phasing == 0){
+                    phasing_string += "1|1";
+                }else if(phasing == 1){
+                    phasing_string += "2|2";
+                }
+                string variant_record = to_string(variant.pos+1) + "," + variant.ref + "," + alt_string;
+                vcf_record[i] += variant_record;
+                phasing_record[i] += phasing_string;
+                vcf_record[i] += ";";
+                phasing_record[i] += ";";
+            }
+		}
+        if(need_match_record){
+            vcf_record[i] = vcf_record[i].substr(0, vcf_record[i].size()-1); // drop the trailing ';' (an empty string stays empty)
+            phasing_record[i] = phasing_record[i].substr(0, phasing_record[i].size()-1);
+        }
+	}
+    if(need_match_record){
+	   match_record += "\t" + vcf_record[0] + "\t" + vcf_record[1];
+        match_record += "\t" + phasing_record[0] + "\t" + phasing_record[1];
+	   match_record += "\t" + to_string(best_path.score) + "\n";
+	   //complex_match_records[thread_index]->push_back(match_record);
+	   match_records_by_mode_by_thread[thread_index][mode_index]->push_back(match_record);
+    }
+    baseline_total_match_num[thread_index][threshold_index]->at(mode_index) += truth_num; // counters are updated for every threshold_index
+    query_total_match_num[thread_index][threshold_index]->at(mode_index) += predict_num;
+bool WholeGenome::DonorLengthEqual(SequencePath & a, SequencePath & b){ // true when a and b have matching donor-sequence lengths for pair 0/1 and for pair 2/3
+    bool truth_same = false; // result for donor sequences 0 and 1
+    bool query_same = false; // result for donor sequences 2 and 3
+    if(a.donor_sequences[0].length() == b.donor_sequences[0].length() &&
+       a.donor_sequences[1].length() == b.donor_sequences[1].length()){ // same order
+        truth_same = true;
+    }
+    else if(a.donor_sequences[0].length() == b.donor_sequences[1].length() &&
+            a.donor_sequences[1].length() == b.donor_sequences[0].length()){ // swapped order also counts
+                truth_same = true;
+            }
+    if(a.donor_sequences[2].length() == b.donor_sequences[2].length() &&
+       a.donor_sequences[3].length() == b.donor_sequences[3].length()){ // same order
+        query_same = true;
+    }
+    else if(a.donor_sequences[2].length() == b.donor_sequences[3].length() &&
+            a.donor_sequences[3].length() == b.donor_sequences[2].length()){ // swapped order also counts
+                query_same = true;
+            }
+    if(truth_same && query_same) return true; // only lengths are compared, never sequence content
+    return false;
+bool IsRemovable(SequencePath & s){ return s.removable;} // predicate passed to list::remove_if in ConvergePaths
+void WholeGenome::ConvergePaths(list<SequencePath> & path_list){ // in/out: paths judged equivalent by DonorLengthEqual are reduced to the best-scoring one
+    //dout << "===========start converge===================" << endl;
+    int path_num = path_list.size();
+    if(path_num <= 1) return; // nothing to compare
+    for(list<SequencePath>::iterator i = path_list.begin(); i!= path_list.end(); ++i){
+        SequencePath  ref_path = *i; // NOTE(review): copies the whole path; flags are therefore written through *i below
+        if(ref_path.removable) continue; // already marked for removal
+        if(!ref_path.same_donor_len) continue; // only paths with same_donor_len set take part
+        list<SequencePath>::iterator j = i;
+        ++j; // compare only with paths after i
+        for(; j != path_list.end(); ++j){
+            SequencePath que_path = *j; // NOTE(review): also a full copy per comparison
+            if(que_path.removable) continue;
+            if(!que_path.same_donor_len) continue;
+            //dout << "Comparing following paths: " << endl;
+            //PrintPath(ref_path);
+            //PrintPath(que_path);
+            if(DonorLengthEqual(ref_path, que_path)){
+                if(ref_path.score >= que_path.score){ // on a tie the earlier path is kept
+                    (*j).removable = true;
+                    //dout << "delete path: " << endl;
+                    //PrintPath((*j));
+                }else{
+                    (*i).removable = true;
+                    //dout << "delete path: " << endl;
+                    //PrintPath((*i));
+                    break; // the reference path lost: stop comparing against it
+                }
+            }
+            //dout << "-    -     -   -   -   -   -  - - -" << endl;
+        }
+    }
+    path_list.remove_if(IsRemovable); // erase every path marked above
+int WholeGenome::test() { // leftover manual test fixture: builds sample data only, since the matching call is commented out; returns 0
+	genome_sequences[0] = "GTCAGCCGG"; // side effect: overwrites the sequence stored at index 0
+	DiploidVariant d1(1, "T", vector<string> ({"A", "C"}), true, true, 0,0,0);
+	DiploidVariant d2(4, "G", vector<string> ({"C", ""}), true, false, 0,0,0);
+	DiploidVariant d3(5, "C", vector<string> ({"T", ""}), true, false, 0,0,0); // this is false negative
+	DiploidVariant d4(6, "C", vector<string> ({"G", ""}), true, false, 0,0,0);
+	DiploidVariant d5(7, "G", vector<string> ({"A", ""}), true, false, 0,0,0);
+	DiploidVariant d6(1, "T", vector<string> ({"A", "C"}), true, true, 0,0,1);
+	DiploidVariant d7(3, "AG", vector<string> ({"A", ""}), true, false, 1,0,1);
+	DiploidVariant d8(7, "G", vector<string> ({"GA", ""}), true, false, 0,1,1);
+    //complex_match_records = new vector<string>*[1];
+    //complex_match_records[0] = new vector<string>;
+	//vector<DiploidVariant> var_list = { d2,d3,d4,d5,d7,d8 };
+	vector<DiploidVariant> var_list = { d1,d2,d3,d4,d5,d6,d7,d8 }; // built but not used by any live code below
+	//cout << MatchingSingleClusterBaseExtending(var_list, 0) << endl;
+	//cout << complex_match_records[0]->at(0) << endl;
+	return 0;
+// private
+void WholeGenome::ClusteringMatchMultiThread() { // splits the clusters over thread_num workers, then writes the .stat and .match output files
+	int start = 0;
+	int cluster_number = variants_by_cluster.size(); // cluster number
+	int cluster_end_boundary = start + cluster_number; // end cluster id, exclusive; NOTE(review): not used below
+	int cluster_step = cluster_number / thread_num; // assign clusters to threads
+	if (cluster_step * thread_num < cluster_number) cluster_step++; // round up so that every cluster is covered
+	int end = start + cluster_step;
+	//initialize vector size
+	//complex_match_records = new vector<string>*[thread_num];
+	match_records_by_mode_by_thread = new vector<string>**[thread_num]; // indexed [thread][mode]
+    //query_matches_by_mode_by_thread = new vector<int> ** [thread_num];
+	for(int i = 0; i < thread_num; i++){
+        match_records_by_mode_by_thread[i] = new vector<string>*[MATCH_MODE_NUM];
+        for(int j = 0; j < MATCH_MODE_NUM; j++){
+            match_records_by_mode_by_thread[i][j] = new vector<string>;
+        }
+	}
+    baseline_total_match_num = new vector<int>** [thread_num]; // indexed [thread][threshold], each vector indexed by mode
+    query_total_match_num = new vector<int> ** [thread_num];
+    for(int i = 0; i < thread_num; i++){
+        baseline_total_match_num[i] = new vector<int>* [ROC_SAMPLE_NUM];
+        query_total_match_num[i] = new vector<int>* [ROC_SAMPLE_NUM];
+        for(int j = 0; j < ROC_SAMPLE_NUM; j++){
+            baseline_total_match_num[i][j] = new vector<int>;
+            baseline_total_match_num[i][j]->resize(MATCH_MODE_NUM, 0); // counters start at zero
+            query_total_match_num[i][j] = new vector<int>;
+            query_total_match_num[i][j]->resize(MATCH_MODE_NUM, 0);
+        }
+    }
+	vector<thread> threads;
+	//spawn threads
+	unsigned i = 0;
+	for (; i < thread_num - 1; i++) { // thread_num - 1 worker threads; the last slice runs on the calling thread
+		threads.push_back(thread(&WholeGenome::ClusteringMatchInThread, this, start, end, i));
+		start = end;
+		end = start + cluster_step; // NOTE(review): end can exceed the cluster count on the last slice - presumably ClusteringMatchInThread clamps it; confirm
+	}
+	// also you need to do a job in main thread
+	// i equals to (thread_num - 1)
+	if (i != thread_num - 1) {
+		dout << "[Error] thread number not match" << endl;
+	}
+	if (start >= variants_by_cluster.size()) { // NOTE(review): in this case only the message is emitted and the last slice is not processed
+		dout << "[Error] index out of map range" << endl;
+	}
+	else {
+		ClusteringMatchInThread(start, end, i);
+	}
+	// wait for all worker threads to finish before any results are read
+    std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
+    //output all results
+    cout << "writing results..." << endl;
+    ofstream output_stat_file;
+    output_stat_file.open(output_dir + "/" + output_prefix+".stat");
+    cout << "=========VarMatch Result Stat.=======" << endl;
+    string stat_head_string = "#score_unit\tmatch_mode\tscore_unit\tqual_threshold\tbaseline_match_num\tquery_match_num\tquery_total_num"; // NOTE(review): "score_unit" appears twice; the third column written below is score_scheme
+    cout << stat_head_string << endl;
+    output_stat_file << "##Baseline:" << baseline_variant_total_num << endl;
+    output_stat_file << "##Query:"<< query_variant_total_num << endl;
+    output_stat_file << stat_head_string << endl;
+    int score_unit;
+    int match_mode;
+    int score_scheme;
+    for(int x = 0; x < score_unit_list.size(); x++){ // one stat line per (score_unit, match_mode, score_scheme) combination
+        score_unit = score_unit_list[x];
+        for(int y = 0; y < match_mode_list.size(); y++){
+            match_mode = match_mode_list[y];
+            for(int z = 0; z < score_scheme_list.size(); z++){
+                score_scheme = score_scheme_list[z];
+                int mode_index = GetIndexFromMatchScore(score_unit, match_mode, score_scheme);
+                int total_ref_complex = 0; // NOTE(review): unused
+                int total_que_complex = 0; // NOTE(review): unused
+                string threshold_string = ""; // the four strings below are comma-separated lists with one entry per threshold
+                string baseline_match_num_string = "";
+                string query_match_num_string = "";
+                string query_total_num_string = "";
+                for(int t = 0; t < threshold_num; t++){
+                    threshold_string += to_string(threshold_list[t]);
+                    int baseline_match_num_by_threshold_by_mode = 0;
+                    int query_match_num_by_threshold_by_mode = 0;
+                    for(int i = 0; i < thread_num; i++){ // sum the per-thread counters
+                        baseline_match_num_by_threshold_by_mode += baseline_total_match_num[i][t]->at(mode_index);
+                        query_match_num_by_threshold_by_mode += query_total_match_num[i][t]->at(mode_index);
+                    }
+                    baseline_match_num_string += to_string(baseline_match_num_by_threshold_by_mode);
+                    query_match_num_string += to_string(query_match_num_by_threshold_by_mode);
+                    query_total_num_string += to_string((int)(query_variant_total_num * (1-per_list[t])) ); // truncated to int
+                    if(t < threshold_num-1){ // no separator after the last entry
+                        threshold_string += ",";
+                        baseline_match_num_string += ",";
+                        query_match_num_string += ",";
+                        query_total_num_string += ",";
+                    }
+                }
+                string total_match_num_string = to_string(score_unit) + "\t" +
+                                                to_string(match_mode) + "\t" + 
+                                                to_string(score_scheme) + "\t" +
+                                                threshold_string + "\t" +
+                                                baseline_match_num_string + "\t" + 
+                                                query_match_num_string + "\t" + 
+                                                query_total_num_string;// + "\t" + to_string(mode_index);
+                cout << total_match_num_string << endl;
+                output_stat_file << total_match_num_string << endl;
+            }
+        }
+    }
+    output_stat_file.close();
+    int bench_mode_index = GetIndexFromMatchScore(0, 0, 0); // NOTE(review): only referenced by commented-out code below
+    for(int x = 0; x < score_unit_list.size(); x++){ // one .match file per (score_unit, match_mode, score_scheme) combination
+        score_unit = score_unit_list[x];
+        for(int y = 0; y < match_mode_list.size(); y++){
+            match_mode = match_mode_list[y];
+            for(int z = 0; z < score_scheme_list.size(); z++){
+                score_scheme = score_scheme_list[z];
+                int mode_index = GetIndexFromMatchScore(score_unit, match_mode, score_scheme);
+                string filename_index = to_string(score_unit) + "_" + to_string(match_mode) + "_" + to_string(score_scheme);
+                ofstream output_complex_file;
+                output_complex_file.open(output_dir + "/" + output_prefix+"."+filename_index+".match");
+                output_complex_file << "##VCF1:" << ref_vcf_filename << endl;
+                output_complex_file << "##VCF2:" << que_vcf_filename << endl;
+                output_complex_file << "#CHROM\tPOS\tREF\tALT\tVCF1\tVCF2\tPHASE1\tPHASE2\tSCORE" << endl;
+                for(int i = 0; i < thread_num; i++){ // records are written thread by thread
+                    for(int k = 0; k < match_records_by_mode_by_thread[i][mode_index]->size(); k++){
+                        if (match_records_by_mode_by_thread[i][mode_index]->at(k).find_first_not_of(' ') != std::string::npos) { // skip records that are empty or all spaces
+                            //if(match_records_by_mode_by_thread[i][mode_index]->at(k)[0] == '$'){
+                                //int bench_mode_index = stoi(match_records_by_mode_by_thread[i][mode_index]->at(k).erase(0,1));
+                                //output_complex_file << match_records_by_mode_by_thread[i][0]->at(k);
+                            //}else{
+                                output_complex_file << match_records_by_mode_by_thread[i][mode_index]->at(k); // each record already ends with '\n'
+                            //}
+                        }
+                    }
+                }
+                output_complex_file.close();
+            }
+        }
+    }
+    // free all matching records and counters allocated at the top of this function
+	for(int i = 0; i < thread_num; i++){
+        for(int j = 0; j < MATCH_MODE_NUM; j++){
+            delete match_records_by_mode_by_thread[i][j];
+        }
+        for(int j = 0; j < ROC_SAMPLE_NUM; j++){
+            delete baseline_total_match_num[i][j];
+            delete query_total_match_num[i][j];
+        }
+        delete[] match_records_by_mode_by_thread[i];
+        delete[] baseline_total_match_num[i];
+        delete[] query_total_match_num[i];
+	}
+	delete[] match_records_by_mode_by_thread; // NOTE(review): the member pointers are left dangling after this; confirm nothing reads them later
+    delete[] baseline_total_match_num;
+    delete[] query_total_match_num;
+int WholeGenome::NormalizeVariantSequence(int pos, string & parsimonious_ref, string & parsimonious_alt0, string & parsimonious_alt1, int chr_id) { // Trims bases shared by ref and both alts (modifying the three strings in place) and returns the adjusted position, or -1 if the chromosome sequence is empty.
+	int left_index = pos;
+	if (genome_sequences[chr_id].size() == 0) return -1; // no sequence stored for this chromosome id
+	if (parsimonious_ref.size() == 1 && parsimonious_alt0.size() == 1 && parsimonious_alt1.size() == 1) return true; // NOTE(review): returns 1 (true converted to int) rather than pos when all alleles are single-base -- confirm callers expect this
+	bool change_in_allels = true;
+	while (change_in_allels) { // repeat until a pass removes no trailing base
+		change_in_allels = false;
+		if (parsimonious_ref.back() == parsimonious_alt0.back() && parsimonious_ref.back() == parsimonious_alt1.back() ) { // all three alleles end with the same base
+			if ((parsimonious_ref.size() > 1 && parsimonious_alt0.size() > 1 && parsimonious_alt1.size() > 1) || left_index > 0) { // when left_index == 0, can not make further changes
+				parsimonious_ref.pop_back();
+				parsimonious_alt0.pop_back();
+				parsimonious_alt1.pop_back();
+				change_in_allels = true;
+			}
+            // else do not make further changes
+		}
+		if (parsimonious_ref.length() == 0 || parsimonious_alt0.length() == 0 || parsimonious_alt1.length() == 0) { // an allele became empty: extend all three one base to the left
+			left_index--;
+			char left_char = toupper(genome_sequences[chr_id][left_index]); // upper-cased genome base at the new left position
+			parsimonious_ref = left_char + parsimonious_ref;
+			parsimonious_alt0 = left_char + parsimonious_alt0;
+			parsimonious_alt1 = left_char + parsimonious_alt1;
+		}
+	}
+	while (parsimonious_ref[0] == parsimonious_alt0[0] && // strip shared leading bases while every allele keeps at least one base
+            parsimonious_ref[0] == parsimonious_alt1[0] &&
+            parsimonious_ref.size() > 1 &&
+            parsimonious_alt0.size() > 1 &&
+            parsimonious_alt1.size() > 1)
+    {
+		parsimonious_ref.erase(0, 1);
+		parsimonious_alt0.erase(0, 1);
+		parsimonious_alt1.erase(0, 1);
+        left_index ++; // left_index is the variant position; removing the leftmost base moves it one to the right
+	}
+	return left_index;
+void WholeGenome::SingleThreadClustering(int chr_id) { // Walks the reference and query variants of one chromosome in position order and appends clusters of VariantIndicator to variant_cluster_by_chrid[chr_id].
+	int ins_len[2] = { 0 }; // running sum of snp.mil in the current cluster, indexed by whether snp.flag is set
+	int del_len[2] = { 0 }; // running sum of snp.mdl in the current cluster, indexed the same way
+	int c_start = 0; // largest pos + ref length seen in the current cluster
+	int c_end = 0; // position of the variant currently being examined
+    sort(ref_variant_by_chrid[chr_id]->begin(), ref_variant_by_chrid[chr_id]->end());
+    sort(que_variant_by_chrid[chr_id]->begin(), que_variant_by_chrid[chr_id]->end());
+    int ref_size = ref_variant_by_chrid[chr_id]->size();
+    int que_size = que_variant_by_chrid[chr_id]->size();
+    //dout << chr_id << "," << ref_size << "," << que_size << endl;
+    int ref_index = 0;
+    int que_index = 0;
+    bool not_first = false; // becomes true once the first variant has been consumed
+    DiploidVariant snp;
+    vector<VariantIndicator> vi_list; // members of the cluster being built
+    while (ref_index < ref_size || que_index < que_size) { // two-way merge of the sorted lists
+		bool take_que = true;
+		if(ref_index < ref_size && que_index < que_size){
+            if(ref_variant_by_chrid[chr_id]->at(ref_index).pos < que_variant_by_chrid[chr_id]->at(que_index).pos){ // reference goes first only when strictly smaller; ties take the query variant
+                take_que = false;
+            }
+		}else if(ref_index < ref_size){ // query list exhausted
+            take_que = false;
+		}
+        int var_index; // index of the chosen variant inside its own list
+		if(take_que){
+            snp = que_variant_by_chrid[chr_id]->at(que_index);
+            //cout << "q |" << que_index << "," << snp.pos << endl;
+            var_index = que_index;
+            que_index++;
+		}else{
+            snp = ref_variant_by_chrid[chr_id]->at(ref_index);
+            //cout << "r |" << ref_index << "," << snp.pos << endl;
+            var_index = ref_index;
+            ref_index++;
+		}
+		// decide whether the current cluster must be closed before this variant is added
+		if (not_first) {
+			c_end = snp.pos;
+			if (c_end - c_start >= 2) { // only gaps of at least two bases can split clusters
+                int separator_length = c_end - c_start;
+				string separator = genome_sequences[chr_id].substr(c_start, separator_length); // genome text between the cluster end and this variant
+				int max_change = max(ins_len[0] + del_len[1], ins_len[1] + del_len[0]);
+				bool separate_cluster = false;
+				if(max_change == 0){
+                    separate_cluster = true;
+				}
+				else if (separator_length > 2 * max_change && // gap longer than twice max_change and either too long to test or not flagged by CheckTandemRepeat
+					(separator_length > MAX_REPEAT_LEN || !CheckTandemRepeat(separator, max_change)))
+				{
+				    separate_cluster = true;
+				}
+				if(separate_cluster){ // store the finished cluster and reset the per-cluster state
+                    variant_cluster_by_chrid[chr_id]->push_back(vi_list);
+                    vi_list.clear();
+					ins_len[0] = 0;
+					del_len[0] = 0;
+					ins_len[1] = 0;
+					del_len[1] = 0;
+					c_start = 0; // reset; re-assigned from the current variant just below
+				}
+			}
+		}
+		c_start = max(c_start, snp.pos + (int)snp.ref.length() );
+        VariantIndicator current_variant_indicator(chr_id, var_index, !take_que); // third argument (refer) is true for reference variants
+        vi_list.push_back(current_variant_indicator);
+		//cluster_vars_map[cluster_index].push_back(snp);
+		if(!not_first) not_first = true;
+		int ref_length = (int)(snp.ref.length()); // NOTE(review): assigned but never read in this function
+		int flag = 0;
+        if(snp.flag) flag = 1; // presumably snp.flag marks query variants (ReadQueryVariants passes flag=true) -- TODO confirm
+//        DiploidVariant snp = front_cluster[k];
+//        int rq = snp.flag;
+        ins_len[flag] += snp.mil;
+        del_len[flag] += snp.mdl;
+	}
+    if(vi_list.size() > 0){ // flush the last, still open cluster
+        variant_cluster_by_chrid[chr_id]->push_back(vi_list);
+    }
+int WholeGenome::ReadReferenceVariants(string filename){ // Forwards to ReadWholeGenomeVariant with flag=false and returns its result.
+    return ReadWholeGenomeVariant(filename, false);
+int WholeGenome::ReadQueryVariants(string filename){ // Forwards to ReadWholeGenomeVariant with flag=true and returns its result.
+    return ReadWholeGenomeVariant(filename, true);
+void WholeGenome::ReadRef(string genome_seq, string ref_vcf){ // Reads the genome sequence file, then the reference VCF, and remembers the reference VCF filename.
+    ReadWholeGenomeSequence(genome_seq); // NOTE(review): the bool result is ignored
+    baseline_variant_total_num = ReadReferenceVariants(ref_vcf); // value returned by the reader is kept as the baseline variant total
+    ref_vcf_filename = ref_vcf;
+// Compare one query VCF against the reference whose filename is held in
+// ref_vcf_filename.
+//   query_vcf      : path of the query VCF file
+//   output_prefix  : copied to this->output_prefix
+//   detail_results : copied to this->detail_results
+//   score_unit_, match_mode_, score_scheme_ : matching parameters. A value of
+//       -1 expands to every listed value (0..1, 0..1 and 0..2 respectively);
+//       score_scheme_ == 3 runs DirectMatch instead of the clustering match.
+// Query-side state is cleared again before returning.
+void WholeGenome::Compare(string query_vcf,
+    string output_prefix,
+    bool detail_results,
+    int score_unit_,
+    int match_mode_,
+    int score_scheme_)
+{
+    // Store this call's parameters first, so that the direct-match test below
+    // (and DirectMatch, which reads match_mode_indicator) sees the current
+    // values instead of whatever a previous call left in the members.
+    score_unit_indicator = score_unit_;
+    match_mode_indicator = match_mode_;
+    score_scheme_indicator = score_scheme_;
+    if(score_scheme_indicator == 3){
+        DirectMatch(ref_vcf_filename, query_vcf);
+        return;
+    }
+    // initialize query variant data structure
+    que_vcf_filename = query_vcf;
+    this->output_prefix = output_prefix;
+    this->detail_results = detail_results;
+    query_variant_total_num = ReadQueryVariants(query_vcf);
+    // NOTE(review): the three parameter lists and mode_index_list are appended
+    // to here but not cleared in this function; confirm they are reset
+    // elsewhere if Compare can be called more than once per object.
+    if(score_unit_indicator == -1){
+        score_unit_list.push_back(0);
+        score_unit_list.push_back(1);
+    }else{
+        score_unit_list.push_back(score_unit_indicator);
+    }
+    if(match_mode_indicator == -1){
+        match_mode_list.push_back(0);
+        match_mode_list.push_back(1);
+    }else{
+        match_mode_list.push_back(match_mode_indicator);
+    }
+    if(score_scheme_indicator == -1){
+        score_scheme_list.push_back(0);
+        score_scheme_list.push_back(1);
+        score_scheme_list.push_back(2);
+    }else{
+        score_scheme_list.push_back(score_scheme_indicator);
+    }
+    for(int i = 0; i < score_unit_list.size(); i++){
+        for(int j = 0; j < match_mode_list.size(); j++){
+            for(int k = 0; k < score_scheme_list.size(); k++){
+                // The first argument is the score unit: index score_unit_list with i.
+                // (Indexing score_scheme_list with i gave a wrong mode index and read
+                // past the end when score_unit_list was the longer list.)
+                int mode_index = GetIndexFromMatchScore(score_unit_list[i], match_mode_list[j], score_scheme_list[k]);
+                mode_index_list.push_back(mode_index);  // so that I can directly know how many mode, do not need to calculate all the time
+            }
+        }
+    }
+    cout << "Baseline VCF: " << ref_vcf_filename << endl;
+    cout << "Query VCF: " << query_vcf << endl;
+    cout << "========VCF Stat.==========" << endl;
+    cout << "Total Number of VCF Entries: " << endl;
+    cout << "Baseline: " << baseline_variant_total_num << "; Query: " << query_variant_total_num << endl;
+    cout << "parallel clustering..." << endl;
+    ParallelClustering();
+    cout << "matching variants..." << endl;
+    ClusteringMatchMultiThread();
+    // most clustering results are cleared inside ParallelClustering function except the following one
+    // which is needed for matching
+    variants_by_cluster.clear();
+    // clean at the end of function
+    for(int j = 0; j < chrom_num; j++){
+        que_variant_by_chrid[j]->clear();
+        //delete que_variant_by_chrid[j];
+    }
+    //delete[] que_variant_by_chrid;
+    query_variant_strings.clear();
+    query_variant_total_num = 0;
+    threshold_list.clear();
+    threshold_num = 0;
+    // The following three matching results are cleared inside ClusteringMatchMultiThread function
+    // match_records_by_mode_by_thread;
+    // baseline_total_match_num;
+    // query_total_match_num;
+    return;
+}
+void WholeGenome::DirectMatch(string ref_vcf, string query_vcf) // Counts query variants that have an equal reference variant at the same position on the same chromosome; the counts are written to dout.
+    //dout << "direct match" << endl;
+    int ref_variant_num = ReadReferenceVariants(ref_vcf); // both files are read by this function itself
+    int que_variant_num = ReadQueryVariants(query_vcf);
+    dout << ref_variant_num << "," << que_variant_num << endl;
+    int match_num = 0;
+    for(int i = 0; i < chrom_num; i++){
+        if(ref_variant_by_chrid[i]->size() == 0 || que_variant_by_chrid[i]->size() == 0) // nothing can match on a chromosome missing from either side
+            continue;
+        //[TODO] not the right way to do it, at least need multimap
+        multimap<int, int> ref_variant_by_pos; // position -> index into ref_variant_by_chrid[i]; several variants may share a position
+        for(int j = 0; j < ref_variant_by_chrid[i]->size(); j++){
+            DiploidVariant var = ref_variant_by_chrid[i]->at(j);
+            int pos = var.pos;
+            ref_variant_by_pos.insert(pair<int, int>(pos, j));
+        }
+        for(int j = 0; j < que_variant_by_chrid[i]->size(); j++){
+            DiploidVariant var = que_variant_by_chrid[i]->at(j);
+            int pos = var.pos;
+            if(ref_variant_by_pos.find(pos) == ref_variant_by_pos.end()) // no reference variant at this position
+                continue;
+            pair<multimap<int, int>::iterator, multimap<int, int>::iterator> var_range;
+            var_range = ref_variant_by_pos.equal_range(pos);
+            for(auto it = var_range.first; it != var_range.second; ++it){ // try every reference variant at this position; each query variant is counted at most once
+                int ref_index = (*it).second;
+                DiploidVariant ref_var = ref_variant_by_chrid[i]->at(ref_index);
+                if (match_mode_indicator != 1 && var == ref_var){ // any mode other than 1: compare with operator==
+                    match_num ++;
+                    break;
+                }else if(match_mode_indicator == 1 && var.CompareNoGenotype(ref_var)){ // mode 1: compare with CompareNoGenotype
+                    match_num ++;
+                    break;
+                }
+            }
+        }
+    }
+    dout << "matched variants: " << match_num << endl;
diff --git a/src/wholegenome_working.h b/src/wholegenome_working.h
new file mode 100644
index 0000000..41a7cce
--- /dev/null
+++ b/src/wholegenome_working.h
@@ -0,0 +1,292 @@
+#pragma once
+#include <unordered_map>
+#include <algorithm>
+#include <vector>
+#include <map>
+#include <list>
+#include <tuple>
+#include <chrono>
+#include <cstdint>
+#include <limits>
+#include <thread>
+#include <set>
+#include "util.h"
+#include "diploidvariant.h"
+//#include "tbb/task_scheduler_init.h"
+//#include "tbb/blocked_range.h"
+//#include "tbb/parallel_for.h"
+//#include "tbb/concurrent_vector.h"
+typedef struct VariantIndicator{ // Small handle naming one variant by chromosome id, index and a reference/query flag.
+    VariantIndicator(int chr_id_ = -1, // defaults describe "no variant": ids of -1, refer = true
+    int var_id_ = -1,
+    bool refer_ = true) :
+    chr_id(chr_id_),
+    var_id(var_id_),
+    refer(refer_){}
+    char chr_id; // NOTE(review): stored as char although the constructor takes int, so ids outside the range of char are narrowed
+    int var_id; // index of the variant inside its per-chromosome list
+    bool refer; // SingleThreadClustering passes true for reference variants and false for query variants
+typedef struct Interval { // Pair of ints (start, end) describing an interval.
+    int start;
+    int end;
+    Interval() : start(0), end(0) {} // default interval is (0, 0)
+    Interval(int s, int e) : start(s), end(e) {}
+class SequencePath{ // State of one candidate path used by the Path* member functions of WholeGenome.
+    SequencePath(int n) // n becomes reference_length and the size of each string_sequences vector
+    {
+        reference_length = n;
+        for(int i = 0; i < 4; i++){
+            string_sequences[i].resize(n, ".");
+            // default value is "."
+            donor_sequences[i] = "";
+        }
+        current_genome_pos = -1; // -1: no position processed yet
+        score = 0;
+        removable = false;
+        same_donor_len = false;
+        current_equal_donor_pos[0] = -1;
+        current_equal_donor_pos[1] = -1;
+        reached_sync_num = 0;
+    }
+    int reference_length;
+    vector<string> string_sequences[4]; // four vectors of n strings, each entry initialised to "."
+    map<int, pair<int, int>> choice_made[2]; // this can be used to indicate if choice is made and which choice
+    // one choice is a pair: variant id, phasing index
+    int current_genome_pos;
+    string donor_sequences[4]; // four strings, empty after construction
+    int current_equal_donor_pos[2];
+    int score;
+    bool removable;
+    bool same_donor_len;
+    int reached_sync_num;
+class WholeGenome{
+    int chrom_num;
+    int thread_num;
+    string ref_vcf_filename;
+    string que_vcf_filename;
+    int baseline_variant_total_num;
+    int query_variant_total_num;
+    vector<string> baseline_variant_strings;
+    vector<string> query_variant_strings;
+    bool detail_results;
+    //int thread_num; VCF->DiploidVariant->WholeGenome
+    map<string, int> chrid_by_chrname;
+    map<int, string> chrname_by_chrid;
+    map<string, int> chrname_dict;
+    map<int, string> genome_sequences;
+    vector<DiploidVariant> ** ref_variant_by_chrid;
+    vector<DiploidVariant> ** que_variant_by_chrid;
+    vector<vector<VariantIndicator>> ** variant_cluster_by_chrid;
+    // so here cluster is represented as vector<vector<VariantIndicator>>
+    // and we create a list of pointers point to cluster
+    // and we hold the point to that list
+    vector<vector<VariantIndicator>> variants_by_cluster;
+    vector<string> *** match_records_by_mode_by_thread;
+    //vector<int> *** baseline_matches_by_mode_by_thread;
+    //vector<int> *** query_matches_by_mode_by_thread;
+    vector<int> *** baseline_total_match_num;
+    vector<int> *** query_total_match_num;
+    //map<float, int> *** tp_qual_num_by_mode_by_thread;
+    //map<float, int> *** fp_qual_num_by_mode_by_thread;
+    //map<float, int> query_total_qual_num;
+    string output_prefix;
+    string output_dir;
+    // copy the above into this.
+    int score_unit_indicator;
+    int match_mode_indicator;
+    int score_scheme_indicator;
+    vector<int> score_unit_list;
+    vector<int> match_mode_list;
+    vector<int> score_scheme_list;
+    vector<int> mode_index_list;
+    vector<double> threshold_list;
+    int threshold_num;
+    vector<float> per_list;
+    bool ReadWholeGenomeSequence(string filename);
+    bool ReadGenomeSequenceList(string filename);
+    int ReadWholeGenomeVariant(string filename, bool flag);
+    bool ReadVariantFileList(string filename);
+    int ReadReferenceVariants(string filename);
+    int ReadQueryVariants(string filename);
+    bool ParallelClustering(); // parallel by chr id
+    bool ParallelMatching(); // parallel by task
+    bool TBBMatching();
+    void SingleThreadClustering(int chr_id);
+    //bool MatchingSingleCluster(int cluster_index, int thread_index, int match_mode);
+    //override
+    bool ClusteringMatchInThread(int start, int end, int thread_index);
+    void ClusteringMatchMultiThread();
+    int NormalizeVariantSequence(int pos,
+                             string & parsimonious_ref,
+                             string & parsimonious_alt0,
+                             string & parsimonious_alt1,
+                             int chr_id);
+    struct compInterval { // Ordering functor: sorts Interval objects by ascending start.
+        bool operator()(const Interval &a, const Interval &b) const {
+            return a.start<b.start; // end is not part of the ordering
+        }
+    };
+    vector<Interval> merge(vector<Interval> &intervals) { // Returns the intervals with overlapping ones coalesced; the argument itself is sorted in place by start.
+        sort(intervals.begin(),intervals.end(),compInterval());
+        vector<Interval> results;
+        for(int i=0; i<intervals.size(); i++) {
+            if(results.empty() || results.back().end < intervals[i].start)  // no overlap
+                results.push_back(intervals[i]);
+            else   // overlap (this includes end == start): extend the last stored interval
+                results.back().end = max(results.back().end, intervals[i].end);
+        }
+        return results;
+    }
+    bool PathNeedDecision(SequencePath& sp, multimap<int, int> * choices_by_pos[], int pos);
+    int PathExtendOneStep(SequencePath& sp,
+                          multimap<int, int> * choices_by_pos[],
+                          const string & reference_sequence,
+                          vector<int> & sync_points,
+                          int match_mode);
+    bool PathMakeDecision(SequencePath& sp,
+                                 vector<DiploidVariant> & variant_list,
+                                 multimap<int, int> * choices_by_pos[],
+                                 list<SequencePath> & sequence_path_list,
+                                 const string & reference_sequence,
+                                 int score_unit,
+                                 int match_mode,
+                                 int score_scheme);
+    bool PathMakeDecisionBackup(SequencePath& sp,
+                                 vector<DiploidVariant> & variant_list,
+                                 multimap<int, int> * choices_by_pos[],
+                                 list<SequencePath> & sequence_path_list,
+                                 const string & reference_sequence,
+                                 int score_unit,
+                                 int match_mode,
+                                 int score_scheme);
+    bool MatchingSingleClusterBaseExtending(int cluster_index,
+                                            int thread_index,
+                                            vector<DiploidVariant> & variant_list,
+                                            string & subsequence,
+                                            int offset,
+                                            multimap<int, int> * choices_by_pos[],
+                                            vector<int> & sync_points,
+                                            int chr_id,
+                                            int score_unit,
+                                            int match_mode,
+                                            int score_scheme,
+                                            int threshold_index);
+    bool DonorLengthEqual(SequencePath & a, SequencePath & b);
+    void ConvergePaths(list<SequencePath> & path_list);
+    int CheckPathEqualProperty(SequencePath & sp, int match_mode);
+    int ScoreEditDistance(DiploidVariant & dv, int allele_indicator);
+    int EditDistance(const std::string& s1, const std::string& s2);
+    bool PathMakeDecisionNoGenotype(SequencePath& sp,
+                                 vector<DiploidVariant> & variant_list,
+                                 multimap<int, int> * choices_by_pos[],
+                                 list<SequencePath> & sequence_path_list,
+                                 const string & reference_sequence,
+                                 int score_unit,
+                                 int match_mode,
+                                 int score_scheme);
+    void ConstructMatchRecord(SequencePath & best_path,
+                               vector<DiploidVariant> & variant_list,
+                               string & subsequence,
+                               int offset,
+                               int thread_index,
+                               int chr_id,
+                               int mode_index,
+                               int threshold_index);
+    void ConstructMatchRecordNoGenotype(SequencePath & best_path,
+                                       vector<DiploidVariant> & variant_list,
+                                       string & subsequence,
+                                       int offset,
+                                       int thread_index,
+                                       int chr_id,
+                                       int mode_index,
+                                       int threshold_index);
+    int CalculateScore(DiploidVariant & dv,
+                       int choice,
+                       int score_unit,
+                       int match_mode,
+                       int score_scheme);
+    int GetIndexFromMatchScore(int score_unit, int match_mode, int score_scheme);
+    bool ClearQuery();
+    inline void ToUpper(string & s){ // Upper-cases s in place, applying ::toupper to each character.
+        transform(s.begin(), s.end(), s.begin(), ::toupper);
+    }
+    bool CheckTandemRepeat(string sequence, int unit_threshold);
+    bool MatchVariantListInThread(int thread_index, 
+        int threshold_index,
+        int chr_id,
+        vector<DiploidVariant> & variant_list,
+        int cluster_id);
+    void initialize_score_matrix(int **score, char **trackBack, int M, int N);
+    int needleman_wunsch(string S1, string S2, string &R1, string &R2);
+    void GenerateAltVector(string ref, string alt, vector<string> & alt_vector);
+    WholeGenome(int thread_num_,
+                string output_dir_,
+                bool pr_curves_);
+    ~WholeGenome();
+    void ReadRef(string genome_seq, 
+      string ref_vcf);
+    void Compare(string query_vcf,
+        string output_prefix,
+        bool detail_results,
+        int score_unit_,
+        int match_mode_,
+        int score_scheme_);
+    void DirectMatch(string ref_vcf,
+                string query_vcf);
+    int test(); // for direct test
+    void PrintPath(SequencePath & sp);
+    const static int MATCH_MODE_NUM = 16;
+    const static int VAR_LEN = 100;
+    const static int MAX_REPEAT_LEN = 1000;
+    const static int ROC_SAMPLE_NUM = 5;
diff --git a/stat b/stat
new file mode 100644
index 0000000..c07befc
--- /dev/null
+++ b/stat
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+import sys
+from sys import argv
+import argparse
+citation = 'Please cite our paper'  # shown as the argparse epilog
+parser = argparse.ArgumentParser(epilog = citation)
+parser.add_argument('-s', metavar='simple.vcf', help='direct match vcf result')
+parser.add_argument('-c', metavar='complex.vcf', help='clustering match vcf result')
+if(args.s is not None):  # NOTE(review): `args` is never assigned in the visible script; there is no parser.parse_args() call
+    print('Analysis direct match vcf results')
+    with open(args.s) as simple_vcf:
+        for line in simple_vcf.readlines():
+            if line.startswith('#'):  # skip header/comment lines
+                continue
+            columns = line.split('\t')
+            ref = # NOTE(review): incomplete statement; the script is unfinished and ends here
diff --git a/vardiff b/vardiff
new file mode 100644
index 0000000..bc585f6
--- /dev/null
+++ b/vardiff
@@ -0,0 +1,299 @@
+#!/usr/bin/env python
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    Authors:
+    Paul Medvedev(pashadag at cse.psu.edu)
+    Chen Sun(chensun at cse.psu.edu)
+import sys
+import textwrap as _textwrap
+import argparse
+import os
+versionError = "You are using an old version of python, please upgrade to python 2.7+\n"
+if sys.hexversion < 0x02070000:
+    print (versionError)
+    exit()
+RUN = True
+author_email = 'chensun at cse.psu.edu'
+class SmartFormatter(argparse.HelpFormatter):  # Help formatter that honours explicit newlines in the text it is given.
+    def _split_lines(self, text, width):  # returns a list of lines, each at most `width` wide
+        paragraphs = text.split('\n')  # every explicit newline starts a new paragraph
+        #return paragraphs
+        multiline_text = []
+        for paragraph in paragraphs:
+            formatted_paragraph = _textwrap.wrap(paragraph, width)  # wrap() yields [] for an empty paragraph, so blank lines are dropped
+            multiline_text = multiline_text + formatted_paragraph
+        return multiline_text
+    def _fill_text(self, text, width, indent):  # `width` is unused: lines are not re-wrapped
+        return ''.join(indent + line for line in text.splitlines(True))  # prefix each line with indent, keeping its line ending
+citation = 'Please cite our paper.'
+parser = argparse.ArgumentParser(prog="vardiff", epilog = citation, formatter_class=lambda prog: SmartFormatter(prog,max_help_position=8))
+parser.add_argument('match_files', nargs='+', metavar='File', help='.match file list')
+parser.add_argument('-o', '--output', help='output directory', metavar='DIRECTORY',default='./match_diff_output')
+args = parser.parse_args()
+pos_donor_dict_list = []
+pos_content_dict_list = []
+html_head = """
+<style type="text/css">
+table, th, td {
+    border: 1px solid black;
+    border-collapse: collapse;
+th, td {
+    padding: 5px;
+    text-align: left;
+    padding: 0 20px 0;
+    margin: 20px 0;
+    line-height: 40px;
+    border-left: 200px solid #ddd;
+    border-right: 200px solid #ddd;
+    text-align: center;
+    font-size:100%;
+    font-family: "Courier New", Courier, monospace !important;
+    font-size:100%;
+    line-height:0px;
+    white-space:nowrap;
+    color: blue;
+    color: gray;
+    border-left: 4px solid #ccc;
+    border-top: 4px solid #ccc;
+    border-bottom: 4px solid #ccc;
+    padding:20px;
+    margin-top: 10px;
+    margin-bottom: 100px;
+    margin-left:10px;
+    display:none;
+    display:block;
+def read_match_file(filename):
+    pos_set = set()
+    pos_donor_dict = {}
+    pos_content_dict = {}
+    with open(filename) as file:
+        for line in file:
+            if line.startswith('#'):
+                continue
+            columns = line.split('\t')
+            if len(columns) < 2:
+                print line
+            pos = columns[0]+'_'+columns[1]
+            pos_set.add(pos)
+            donor = columns[3]
+            pos_donor_dict[pos] = donor
+            pos_content_dict[pos] = line
+    return pos_donor_dict, pos_content_dict, pos_set
+def levenshtein(s1, s2):  # Returns the Levenshtein edit distance between s1 and s2, computed one table row at a time.
+    if len(s1) < len(s2):  # make s1 the longer sequence so each row has len(s2) + 1 cells
+        return levenshtein(s2, s1)
+    # len(s1) >= len(s2)
+    if len(s2) == 0:  # distance to an empty sequence is the length of the other one
+        return len(s1)
+    previous_row = range(len(s2) + 1)  # row for the empty prefix of s1: 0, 1, ..., len(s2)
+    for i, c1 in enumerate(s1):
+        current_row = [i + 1]  # first cell: cost of deleting the first i + 1 characters of s1
+        for j, c2 in enumerate(s2):
+            insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
+            deletions = current_row[j] + 1       # than s2
+            substitutions = previous_row[j] + (c1 != c2)  # the bool adds 1 only when the characters differ
+            current_row.append(min(insertions, deletions, substitutions))
+        previous_row = current_row
+    return previous_row[-1]  # last cell of the last row is the distance between the full sequences
+def separate_string(s):  # Returns s with a single space between consecutive characters, e.g. 'ACG' -> 'A C G'.
+    l = list(s)
+    return ' '.join(l)
+def parse_variant(offset, variant_string, phasing_string, is_baseline):  # Returns a list of HTML lines ('<hr>' plus one '<pre>' per allele) for one variant.
+    variant_content_list = ['<hr>\n']
+    variant_component = variant_string.split(',')  # parsed as 'pos,ref,alts'
+    variant_pos = int(variant_component[0])
+    variant_prefix = ' '*(variant_pos - offset)  # pad so the alleles start (variant_pos - offset) characters in
+    variant_ref = variant_component[1]
+    variant_alts = variant_component[2].split('/')  # alternative alleles are '/'-separated
+    phasing_component = phasing_string.split('|')  # allele numbers are '|'-separated; '0' stands for REF
+    if is_baseline:
+        if '0' in phasing_component:  # REF appears in the phasing: mark the line as selected
+            variant_content_list.append('<pre class="selected">Baseline REF:' + separate_string(variant_prefix + variant_ref) + '</pre>\n')
+        else:
+            variant_content_list.append('<pre>Baseline REF:' + separate_string(variant_prefix + variant_ref) + '</pre>\n')
+    else:
+        if '0' in phasing_component:
+            variant_content_list.append('<pre class="selected">Query    REF:' + separate_string(variant_prefix + variant_ref) + '</pre>\n')
+        else:
+            variant_content_list.append('<pre>Query    REF:' + separate_string(variant_prefix + variant_ref) + '</pre>\n')
+    for i in range(2):  # always emits two ALT lines
+        j = i
+        #print variant_alts, phasing_component
+        if len(variant_alts) < 2:  # only one ALT given: show it on both lines
+            j = 0
+        if str(j+1) in phasing_component:  # ALT number j + 1 appears in the phasing
+            variant_content_list.append('<pre class="selected">         ALT:' + separate_string(variant_prefix + variant_alts[j]) + '</pre>\n')
+        else:
+            variant_content_list.append('<pre class="discard">         ALT:' + separate_string(variant_prefix + variant_alts[j]) + '</pre>\n')
+    return variant_content_list
+def parse_match(match_string, filename):  # Returns the HTML lines describing one tab-separated match record taken from `filename`.
+    matching_content_list = ['<div class="separate_line" >' + filename +'</div>\n']
+    match_columns = match_string.split('\t')
+    offset = int(match_columns[1])  # column 1 is the position every variant is aligned against
+    match_ref = match_columns[2]
+    match_donors = match_columns[3].split('/')  # one or two donor sequences
+    matching_content_list.append('<pre>Genome   Ref:'+ separate_string(match_ref) +'</pre>\n')
+    matching_content_list.append('<pre>     Donor 0:'+ separate_string(match_donors[0]) +'</pre>\n')
+    if len(match_donors) > 1:
+        matching_content_list.append('<pre>     Donor 1:'+ separate_string(match_donors[1]) +'</pre>\n')
+    else:  # single donor: repeat it as Donor 1
+        matching_content_list.append('<pre>     Donor 1:'+ separate_string(match_donors[0]) +'</pre>\n')
+    if match_columns[4] == '.':  # '.' in column 4: no variant details to render
+        return matching_content_list
+    for i in range(2):  # i == 0 uses columns 4 and 6 (baseline), i == 1 uses columns 5 and 7 (query)
+        matching_variants = match_columns[4+i].split(';')
+        matching_phasing = match_columns[6+i].split(';')
+        is_baseline = True
+        if i == 1:
+            is_baseline = False
+        for k in range(len(matching_variants)):  # assumes one phasing entry per variant -- TODO confirm against the writer
+            matching_content_list += parse_variant(offset, matching_variants[k], matching_phasing[k], is_baseline)
+    return matching_content_list
+def main():
+    if not os.path.exists(args.output):
+        os.mkdir(args.output)
+    match_file_list = args.match_files
+    match_file_num = len(match_file_list)
+    diff_filename_list = []
+    diff_content_table = [[] for i in range(match_file_num)]
+    union_pos_set = set()
+    for match_file in match_file_list:
+        (pos_donor_dict, pos_content_dict, pos_set) = read_match_file(match_file)
+        #print pos_set
+        pos_donor_dict_list.append(pos_donor_dict)
+        pos_content_dict_list.append(pos_content_dict)
+        union_pos_set.update(pos_set)
+        match_basename = os.path.basename(match_file)
+        diff_filename_list.append(args.output + '/' + match_basename + '.diff')
+    union_pos_list = list(union_pos_set)
+    union_pos_list.sort()
+    #print union_pos_list
+    # think specifically for two VCF files
+    # output detail
+    compare_filename = args.output + '/' + 'compare.html'
+    compare_file = open(compare_filename, 'w')
+    compare_file.write(html_head)
+    for pos in union_pos_list:
+        have_diff = False
+        for i in range(match_file_num):
+            if pos not in pos_donor_dict_list[i]:
+                have_diff = True
+            break
+        #print have_diff
+        if not have_diff:
+            donor_benchmark = pos_donor_dict_list[0][pos]
+            for i in range(match_file_num):
+                #print pos, donor_benchmark, pos_donor_dict[i][pos]
+                if pos_donor_dict_list[i][pos] != donor_benchmark:
+                    have_diff = True
+                    break
+        if have_diff:
+            compare_file.write('<div class="box">\n')
+            for i in range(match_file_num):
+                if pos in pos_content_dict_list[i]:
+                    diff_content_table[i].append(pos_content_dict_list[i][pos])
+                    for content in parse_match(pos_content_dict_list[i][pos], match_file_list[i]):
+                        compare_file.write(content)
+            compare_file.write('</div>\n')
+    for i in range(match_file_num):
+        with open(diff_filename_list[i], 'w') as diff_file:
+            for diff_content in diff_content_table[i]:
+                diff_file.write(diff_content + '\n')
+    compare_file.write(html_tail)
+    compare_file.close()
+if __name__ == '__main__':
+    main()
diff --git a/varmatch b/varmatch
new file mode 100755
index 0000000..ab6757f
--- /dev/null
+++ b/varmatch
@@ -0,0 +1,587 @@
+#!/usr/bin/env python
+# Copyright 2015, Chen Sun
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+    Authors:
+    Chen Sun(chensun at cse.psu.edu)
+    Paul Medvedev(pashadag at cse.psu.edu)
+import sys
+# Refuse to run on interpreters older than Python 2.7.
+versionError = "You are using an old version of python, please upgrade to python 2.7+\n"
+if sys.hexversion < 0x02070000:
+    # Report on stderr and leave with a non-zero status so that callers can
+    # detect the failure; sys.exit is used because the bare exit() helper is
+    # only injected by the site module.
+    sys.stderr.write(versionError)
+    sys.exit(1)
+import textwrap as _textwrap
+import multiprocessing
+import argparse
+import os
+import subprocess
+import time
+# When False, shell_run() only echoes each command (after a pause) instead of executing it.
+RUN = True
+# Contact address printed by multiple_compare() when the comparison program cannot be found.
+author_email = 'chensun at cse.psu.edu'
+class SmartFormatter(argparse.HelpFormatter):
+    """Help formatter that honours the explicit newlines written in help strings."""
+    def _split_lines(self, text, width):
+        """Wrap every newline-separated paragraph of *text* to *width* on its own."""
+        wrapped = []
+        for chunk in text.split('\n'):
+            # textwrap yields no line for an empty paragraph, so blank lines vanish
+            wrapped.extend(_textwrap.wrap(chunk, width))
+        return wrapped
+    def _fill_text(self, text, width, indent):
+        """Prefix each line of *text* with *indent*, keeping its line breaks untouched."""
+        pieces = [indent + piece for piece in text.splitlines(True)]
+        return ''.join(pieces)
+# Command-line interface. SmartFormatter keeps the explicit newlines used in
+# the multi-line help texts defined below.
+citation = 'Please cite our paper.'
+parser = argparse.ArgumentParser(prog="varmatch", epilog = citation, formatter_class=lambda prog: SmartFormatter(prog,max_help_position=8))
+parser.add_argument('-b', '--baseline', required=True, metavar='File', help = 'baseline variant VCF filename')
+# NOTE(review): --query is not marked required although main() iterates over it - confirm this is intended.
+parser.add_argument('-q', '--query', nargs='+', metavar='File List', help = 'query variant VCF filename')
+parser.add_argument('-g', '--genome', required=True, metavar='File', help= 'genome sequence FASTA filename')
+parser.add_argument('-o', '--output', help='output directory', metavar='DIRECTORY',default='./output')
+# The thread count is kept as a string; multiple_compare() appends it to the command line as-is.
+thread_string = "number of threads, default is the number of available cores (For this machine:" + str(multiprocessing.cpu_count()) + \
+                ")\nIf larger than number of available cores or less than 1, automatically set to default value"
+parser.add_argument('-t', '--thread', metavar="INT", help=thread_string, default=str(multiprocessing.cpu_count()))
+# Help texts for the three matching criteria (-u, -m, -s).
+score_unit_string = "scoring function/score unit: (Default: -1)\n"\
+    "-1 : iterate both 0 and 1.\n"\
+    "0  : the score that a VCF entry contributes is 1.\n"\
+    "1  : the score that a VCF entry contributes is the edit distance between the new allele and the reference one.\n"
+match_mode_string = "matching mode: (Default: -1)\n"\
+    "-1 : iterate both 0 and 1.\n"\
+    "0  : a set of query entries match a set of baseline entries if, "\
+    "for each entry, we can select one of the alleles such that the inferred sequences are identical\n"\
+    "1  : a set of query entries match a set of baseline entries if there exist a phasing of each set such that "\
+    "the two inferred haplotypes from the query are equal to the two inferred haplotypes from the baseline.\n"
+score_scheme_string = "scoring scheme: (Default: -1)\n"\
+    "-1 : iterate 0, 1, and 2 (excluding 3)\n"\
+    "0  : find two subsets of non-overlapping equivalent variants such that "\
+    "the score of the matched variants is maximized \n"\
+    "1  : find two subsets of non-overlapping equivalent variants such that"\
+    " the score of the chosen baseline variants is maximized\n"\
+    "2  : find a maximum scoring set of variants in the query such that"\
+    " each variant can be matched by a subset of the baseline variants\n"\
+    "3  : (1 to 1 direct match) find a maximum scoring set of entry pairs such that each entry pair contains"\
+    " one query and one baseline variant that result in the same sequence."\
+    " In this scheme, different scoring functions and "\
+    "matching mode have no difference.\n"
+# No type= is given for these options, so values supplied on the command line
+# stay strings while the default is the integer -1; multiple_compare() passes
+# them through str() when building the command.
+parser.add_argument('-u', '--score_unit', help=score_unit_string, metavar='[-1,0,1]', default=-1)
+parser.add_argument('-m', '--match_mode', help=match_mode_string, metavar='[-1,0,1]', default=-1)
+parser.add_argument('-s', '--score_scheme', help=score_scheme_string, metavar='[-1,0,1,2,3]', default=-1)
+parser.add_argument('-G', '--no_graph', help='disable graphic module', action = 'store_true')
+disable_curves_string = "disable Precision-Recall curves, if use -G or --no_graph,"\
+                        " then automatically disable these curves"
+parser.add_argument('-C', '--disable_curves', help=disable_curves_string, action='store_true')
+fast_mode_string = "In this mode, automatically disable graphic module and precision-recall curves,"\
+                   " only performs one matching criterion.\n"\
+                   " Fast mode is equivalent to use following parameters compulsively: -G -u 0 -m 0 -s 0"
+parser.add_argument('-f', '--fast_mode', help=fast_mode_string, action='store_true')
+args = parser.parse_args()
+# Fast mode forces a single matching criterion with the graphic module
+# disabled, i.e. the equivalent of passing "-G -u 0 -m 0 -s 0". The overrides
+# must only apply when -f/--fast_mode was actually given.
+if args.fast_mode:
+    args.no_graph = True
+    args.score_unit = 0
+    args.match_mode = 0
+    args.score_scheme = 0
+def shell_run(command, hide=False):
+    """
+    Echo a shell command and, unless the module-level RUN flag is False, execute it.
+    command: command line handed to the shell as one string.
+    hide:    when True, the child's stdout and stderr are discarded.
+    With RUN set to False the command is only printed (after a 3.5 second
+    pause) and nothing is executed.
+    """
+    if not RUN:
+        time.sleep(3.5)
+        print(command)
+        return
+    print(command)
+    if not hide:
+        subprocess.call(command, shell=True)
+        return
+    # The context manager releases the null-device handle even when
+    # subprocess.call raises, which the manual open()/close() pair did not.
+    with open(os.devnull, 'w') as devnull:
+        subprocess.call(command, shell=True, stdout=devnull, stderr=subprocess.STDOUT)
+def check_command(command):
+    """
+    check if corresponding command available
+    Returns True when *command* is an existing file path, or when a file of
+    that name is found in one of the directories listed in PATH; otherwise
+    returns False.
+    """
+    if os.path.isfile(command):
+        return True
+    # os.pathsep keeps the split portable, and a missing PATH is treated as empty.
+    for cmdpath in os.environ.get('PATH', '').split(os.pathsep):
+        # Probing the candidate path directly avoids listing whole directories,
+        # which is slow and raises on directories that cannot be read.
+        if cmdpath and os.path.isfile(os.path.join(cmdpath, command)):
+            return True
+    return False
+def table_2_html(table):
+    """Render a list of string rows as an HTML table; the first row becomes the header row."""
+    parts = ['<table border=.5>']
+    for index, cells in enumerate(table):
+        # header cells for the first row, data cells for every other row
+        tag = 'th' if index == 0 else 'td'
+        opening = '<' + tag + '>'
+        closing = '</' + tag + '>'
+        parts.append('<tr>' + opening + (closing + opening).join(cells) + closing + '</tr>')
+    parts.append('</table>')
+    return ''.join(parts)
+html_head = """
+<style type="text/css">
+table, th, td {
+    border: 1px solid black;
+    border-collapse: collapse;
+th, td {
+    padding: 5px;
+    text-align: left;
+    border: 4px solid #ccc;
+    padding:20px;
+    margin:10px 100px 100px 10px;
+    display:none;
+    display:block;
+# Plot marker symbols; create_stat_html() cycles through them, one per query.
+marker_list = ['o', 'v', '1', '8', 's', 'p', '*', 'h', 'x', 'D']
+def multiple_compare(baseline_file, query_list, genome_file):
+    """
+    Assemble the comparison program's command line for the baseline, the
+    genome and every query file, then run it through shell_run().
+    Exits after printing an error when the program has not been verified yet
+    and check_command() cannot find it.
+    """
+    global check_compare_command
+    global output_dir
+    tool_missing = not (check_compare_command or check_command(compare_tool))
+    if tool_missing:
+        print ('Error: can not find program: ' + compare_tool)
+        print ('\t Try "make" command before execute, or contact author for support: ' + author_email)
+        exit()
+    check_compare_command = True
+    pieces = [compare_tool, ' -b ', baseline_file, ' -g ', genome_file, ' -o ', output_dir]
+    for query_file in query_list:
+        pieces.extend([' -q ', query_file])
+    # the thread option is only forwarded when it is a positive integer
+    if args.thread is not None and int(args.thread) > 0:
+        pieces.extend([' -t ', args.thread])
+    for flag, value in ((' -u ', args.score_unit), (' -m ', args.match_mode), (' -s ', args.score_scheme)):
+        pieces.extend([flag, str(value)])
+    # the -C switch is also passed whenever the graphic module is disabled
+    if args.no_graph or args.disable_curves:
+        pieces.append(' -C ')
+    shell_run(''.join(pieces))
+def varmatch_pairwise(baseline_file, query_file, genome_file):
+    """
+    Return the output prefix '<output_dir>/<baseline basename>_<query basename>'.
+    genome_file is accepted but not used.
+    """
+    global output_dir
+    basenames = [os.path.basename(path) for path in (baseline_file, query_file)]
+    return output_dir + '/' + '_'.join(basenames)
+def create_table_prefx(score_unit, match_mode, score_scheme):
+    """
+    Translate the three string-encoded matching parameters into a matching id
+    (one letter per parameter) plus a readable label for each parameter.
+    Returns [matching_id, score_unit_label, match_mode_label, score_scheme_label].
+    """
+    # score unit: '0' means unit cost, anything else edit distance
+    if score_unit == '0':
+        unit_letter, unit_label = 'U', 'Unit Cost[U]'
+    else:
+        unit_letter, unit_label = 'E', 'Edit Distance[E]'
+    # match mode: '0' means genotype, anything else variant
+    if match_mode == '0':
+        mode_letter, mode_label = 'G', 'Genotype[G]'
+    else:
+        mode_letter, mode_label = 'V', 'Variant[V]'
+    # score scheme: unknown codes add no letter and keep the 'Total[T]' label
+    scheme_lookup = {
+        '0': ('T', 'Total[T]'),
+        '1': ('B', 'Baseline[B]'),
+        '2': ('Q', 'Query[Q]'),
+    }
+    scheme_letter, scheme_label = scheme_lookup.get(score_scheme, ('', 'Total[T]'))
+    return [unit_letter + mode_letter + scheme_letter, unit_label, mode_label, scheme_label]
+def _percentage(numerator, denominator):
+    """Return 100 * numerator / denominator as a float, or 0.0 when the denominator is zero."""
+    denominator = float(denominator)
+    if denominator == 0:
+        return 0.0
+    return float(numerator) * 100 / denominator
+def parse_stat(output_prefix):
+    """
+    Parse '<output_dir>/<output_prefix>.stat' into recall/precision figures.
+    File layout as read here: lines starting with '##' carry 'name:count'
+    totals ('##Baseline' sets the baseline total, any other name sets the query
+    total); other lines starting with '#' are ignored. Every remaining line is
+    tab separated: columns 0-2 are the score unit, match mode and score scheme
+    codes, columns 4, 5 and 6 are comma-separated lists of baseline match
+    counts, query match counts and query totals.
+    NOTE(review): the list entries presumably correspond to successive quality
+    thresholds - confirm against the program that writes the file.
+    Returns a tuple (baseline_num, query_num, matching ids, recall of the first
+    entry per row, precision of the first entry per row, display table with a
+    header row, per-row recall lists, per-row precision lists). All
+    percentages are on a 0-100 scale; a zero denominator yields 0.0.
+    """
+    global output_dir
+    stat_filename = output_dir + '/' + output_prefix + '.stat'
+    head = ['Matching Id', 'Score Unit', 'Match Mode', 'Score Scheme', 'Baseline Match Number', 'Query Match Number', 'Recall(%)', 'Precision(%)']
+    no_filter_table = [head]
+    x = [] # matching id list
+    y = [] # recall of the first entry of every row
+    z = [] # precision of the first entry of every row
+    sensitivity_table = []
+    specificity_table = []
+    baseline_num = 0.
+    query_num = 0.
+    with open(stat_filename) as stat_file:
+        for line in stat_file:
+            line = line.strip()
+            # blank lines carry no data and would otherwise fail the column access below
+            if not line:
+                continue
+            if line.startswith('##'):
+                columns = line.split(':')
+                if columns[0] == '##Baseline':
+                    baseline_num = float(columns[1])
+                else:
+                    query_num = float(columns[1])
+            if line.startswith('#'):
+                continue
+            temp = line.split('\t')
+            row = create_table_prefx(temp[0], temp[1], temp[2])
+            baseline_match_str_list = temp[4].split(',')
+            query_match_str_list = temp[5].split(',')
+            query_total_str_list = temp[6].split(',')
+            # recall: matched baseline variants over all baseline variants
+            sensitivity_list = [_percentage(match_num, baseline_num) for match_num in baseline_match_str_list]
+            # precision: matched query variants over the query total at the same
+            # list position (indexed on purpose so a short totals list still fails loudly)
+            specificity_list = []
+            for i in range(len(query_match_str_list)):
+                specificity_list.append(_percentage(query_match_str_list[i], query_total_str_list[i]))
+            x.append(row[0])
+            row += [baseline_match_str_list[0], query_match_str_list[0], "%.3f" % sensitivity_list[0], "%.3f" % specificity_list[0]]
+            y.append(sensitivity_list[0])
+            z.append(specificity_list[0])
+            sensitivity_table.append(sensitivity_list)
+            specificity_table.append(specificity_list)
+            no_filter_table.append(row)
+    return baseline_num, query_num, x, y, z, no_filter_table, sensitivity_table, specificity_table
+def create_table_by_matchingid_from_by_query(table_list, matching_list, query_number):
+    """
+    Regroup the per-query result tables into one table per matching id.
+    table_list:    one table per query; row 0 is the column header and row
+                   k + 1 holds the result for matching_list[k].
+    matching_list: the matching ids, in table order.
+    query_number:  unused, kept so existing callers keep working.
+    Returns a list with one table per matching id, each made of a title row
+    followed by one row per query ('Query<n>' plus columns 4 onwards of that
+    query's result row).
+    """
+    title = ['Query Id', 'Baseline Match Number', 'Query Match Number', 'Recall(%)', 'Precision(%)']
+    table_by_matchingid = []
+    for matching_index in range(len(matching_list)):
+        matching_table = [list(title)]
+        for query_index, query_table in enumerate(table_list):
+            # + 1 skips the header row; without it the first matching id was
+            # paired with the header and the last result row was never shown
+            raw_row = query_table[matching_index + 1]
+            matching_table.append(['Query' + str(query_index + 1)] + raw_row[4:])
+        table_by_matchingid.append(matching_table)
+    return table_by_matchingid
+# all html and picture are created from stat file, not parameters
+def create_stat_html(query_list, output_prefix_list):
+    """
+    Write the HTML reports (and their PNG figures) for all queries into output_dir.
+    query_list:         query file names, used for labelling only.
+    output_prefix_list: stat-file prefixes handed to parse_stat(), one per query.
+    stat.html is always written. Unless args.no_graph is set, sensitivity.png
+    and specificity.png are drawn as well; unless args.no_graph or
+    args.disable_curves is set, precision_recall.html is written together with
+    one '*.roc.png' figure per matching id and per query.
+    """
+    global output_dir
+    html_filename = output_dir + '/stat.html'
+    html_file = open(html_filename, 'w')
+    html_file.write(html_head)
+    html_file.write('<h1>VarMatch Report</h1>')
+    html_file.write('<p>precison and recall analysis for each query with variant quality ≥ 0</p>')
+    exp_num = len(output_prefix_list)
+    baseline_num_list = []
+    query_num_list = []
+    table_list = []
+    label_list = []
+    sensitivity_list = []
+    specificity_list = []
+    sensitivity_table_list = []
+    specificity_table_list = []
+    for output_prefix in output_prefix_list:
+        (baseline_num, query_num, x, y, z, table, sensitivity_table, specificity_table) = parse_stat(output_prefix)
+        baseline_num_list.append(int(baseline_num))
+        query_num_list.append(int(query_num))
+        label_list.append(x)
+        sensitivity_list.append(y)
+        specificity_list.append(z)
+        table_list.append(table)
+        sensitivity_table_list.append(sensitivity_table)
+        specificity_table_list.append(specificity_table)
+    if not table_list:
+        html_file.close()
+        return
+    if not args.no_graph:
+        # plotting libraries are imported lazily so that -G works without them
+        import numpy as np
+        import matplotlib
+        matplotlib.use('Agg')
+        import matplotlib.pyplot as plt
+        axes = plt.gca()
+        #axes.set_xlim([xmin,xmax])
+        axes.set_ylim([0,100])
+        for i in range(exp_num):
+            marker_id = i % len(marker_list)
+            marker_sign = marker_list[marker_id]
+            label_sign = 'Query ' + str(i+1)
+            x = np.array(range(len(label_list[0])))
+            plt.xticks(x, label_list[0])
+            plt.plot(x, sensitivity_list[i], marker = marker_sign, linestyle = '-', label = label_sign)
+        plt.xlabel('Matching Id')
+        plt.ylabel('Recall(%)')
+        #plt.title('Sensitivity of Queries under Different Matching Parameters')
+        plt.legend(loc='best')
+        plt.savefig(output_dir + '/sensitivity.png')
+        plt.clf() # clear figure for the next
+        axes = plt.gca()
+        #axes.set_xlim([xmin,xmax])
+        axes.set_ylim([0,100])
+        for i in range(exp_num):
+            marker_id = i % len(marker_list)
+            marker_sign = marker_list[marker_id]
+            label_sign = 'Query ' + str(i+1)
+            x = np.array(range(len(label_list[0])))
+            plt.xticks(x, label_list[0])
+            plt.plot(x, specificity_list[i], marker = marker_sign, linestyle = '-', label = label_sign)
+        plt.xlabel('Matching Id')
+        plt.ylabel('Precision(%)')
+        #plt.title('Specificity of Queries under Different Matching Parameters')
+        plt.legend(loc='best')
+        plt.savefig(output_dir + '/specificity.png')
+        html_file.write('<h2>VarMatch Matching Parameters Table</h2>'+'\n')
+        parameter_table = []
+        temp_table = table_list[0]
+        for row in temp_table:
+            parameter_table.append(row[:4])
+        html_file.write(table_2_html(parameter_table))
+        html_file.write('<h2>Sensitivity and Specificity of Queries under Different Matching Parameters</h2>'+'\n')
+        html_file.write('<p> Baseline File: ' + args.baseline+'</p>' + '\n')
+        for i in range(exp_num):
+            html_file.write('<p> Query ' + str(i+1) + ': ' + query_list[i] + '</p>' + '\n')
+        html_file.write('<h3>Recall of Queries under Different Matching Parameters</h3>'+'\n')
+        html_file.write('<img src="sensitivity.png" alt="Sensitivity Graph Not Found...">'+'\n')
+        html_file.write('<h3>Precison of Queries under Different Matching Parameters</h3>'+'\n')
+        html_file.write('<img src="specificity.png" alt="Specificity Graph Not Found...">'+'\n')
+    # sensitivity and specificity analysis by query
+    html_file.write('<h2>Sensitivity and Specificity Analysis by Query</h2>'+'\n')
+    for i in range(exp_num):
+        html_file.write('<div class="box">')
+        html_file.write('<h3>Query File: ' + query_list[i] + '</h3>'+'\n')
+        html_file.write('<p> Number of Variants in Baseline: ' + str(baseline_num_list[i]) + '</p>'+'\n')
+        html_file.write('<p> Number of Variants in Query: ' + str(query_num_list[i]) + '</p>'+'\n')
+        html_file.write(table_2_html(table_list[i]))
+        html_file.write('</div>'+'\n')
+    if exp_num > 1:
+        # sensitivity and specificity analysis by matching id
+        html_file.write('<h2>Sensitivity and Specificity Analysis by Matching Id</h2>')
+        table_by_matchingid = create_table_by_matchingid_from_by_query(table_list, label_list[0], exp_num)
+        for i in range(len(label_list[0])):
+            html_file.write('<div class="box">')
+            html_file.write('<h3>Matching Id: ' + label_list[0][i] + '</h3>'+'\n')
+            html_file.write(table_2_html(table_by_matchingid[i]))
+            html_file.write('</div>'+'\n')
+    # The report is finished and closed for any number of queries; with a
+    # single query it used to be left without its tail and never closed.
+    html_file.write(html_tail)
+    html_file.close()
+    # create roc html
+    if args.no_graph or args.disable_curves:
+        return
+    html_filename = output_dir + '/precision_recall.html'
+    html_file = open(html_filename, 'w')
+    html_file.write(html_head)
+    html_file.write('<h1>VarMatch Precision-Recall Curves</h1>')
+    html_file.write('<h2>VarMatch Matching Parameters Table</h2>'+'\n')
+    parameter_table = []
+    temp_table = table_list[0]
+    for row in temp_table:
+        parameter_table.append(row[:4])
+    html_file.write(table_2_html(parameter_table))
+    for i in range(exp_num):
+        html_file.write('<p>Query ' + str(i+1) + ': ' + query_list[i] + '</p>' + '\n')
+    html_file.write('<h2>Precision-Recall Curve by Matching Id</h2>')
+    html_file.write('<p>For each matching id, compare all queries in one graph</p>')
+    for i in range(len(parameter_table)-1):
+        html_file.write('<h3>Precision-Recall Curve for Parameter '+parameter_table[i+1][0]+'</h3>'+'\n')
+        plt.clf()
+        for j in range(exp_num):
+            x = sensitivity_table_list[j][i]
+            y = specificity_table_list[j][i]
+            # Slice assignment and list.sort() rescale (to 0-1) and reorder the
+            # stored lists in place; the per-query plots below rely on that.
+            x[:] = [a/100 for a in x]
+            #y.reverse()
+            y[:] = [a/100 for a in y]
+            x.sort()
+            y.sort(reverse=True)
+            # NOTE(review): only this local copy is reversed to ascending order,
+            # so this figure and the per-query figure plot precision in opposite
+            # orders - confirm which one is intended.
+            y = y[::-1]
+            label_sign = 'Query ' + str(j+1)
+            plt.plot(x,y, label = label_sign)
+        #x = [0.0, 1.0]
+        #plt.plot(x, x, linestyle='dashed', color='red', linewidth=2, label='random')
+        plt.xlim(0.0, 1.0)
+        plt.ylim(0.0, 1.0)
+        plt.xlabel('Recall')
+        plt.ylabel('Precision')
+        plt.legend(loc='best')
+        plt.tight_layout()
+        plt.savefig(output_dir + '/parameter' + str(i)+'.roc.png')
+        html_file.write('<img src="parameter'+str(i) + '.roc.png'+'" alt="ROC Curve Not Found">\n')
+    html_file.write('<h2>Precision-Recall Curve by Query</h2>')
+    html_file.write('<p>For each query, compare all matching id in one graph</p>')
+    for i in range(exp_num):
+        html_file.write('<h3>Precision-Recall Curve for Query '+str(i+1)+'</h3>'+'\n')
+        plt.clf()
+        colormap = plt.cm.gist_ncar
+        curve_colors = [colormap(k) for k in np.linspace(0, 0.9, len(parameter_table))]
+        curve_axes = plt.gca()
+        # set_color_cycle no longer exists in current matplotlib releases;
+        # prefer set_prop_cycle and keep the old call for old installations.
+        if hasattr(curve_axes, 'set_prop_cycle'):
+            curve_axes.set_prop_cycle('color', curve_colors)
+        else:
+            curve_axes.set_color_cycle(curve_colors)
+        for j in range(len(parameter_table)-1):
+            # already rescaled to 0-1 and sorted in place by the loop above
+            x = sensitivity_table_list[i][j]
+            y = specificity_table_list[i][j]
+            label_sign = parameter_table[j+1][0]
+            plt.plot(x,y, label = label_sign)
+        #x = [0.0, 1.0]
+        #plt.plot(x, x, linestyle='dashed', color='red', linewidth=2, label='random')
+        plt.xlim(0.0, 1.0)
+        plt.ylim(0.0, 1.0)
+        plt.xlabel('Recall')
+        plt.ylabel('Precision')
+        plt.legend(loc='best')
+        plt.tight_layout()
+        plt.savefig(output_dir + '/query' + str(i)+'.roc.png')
+        html_file.write('<img src="query'+str(i) + '.roc.png'+'" alt="ROC Curve Not Found">\n')
+    html_file.write(html_tail)
+    html_file.close()
+def main():
+    """
+    Script entry point: prepare the output directory, run the comparison
+    program on all query files and build the HTML report from its stat files.
+    """
+    if len(sys.argv) < 2:
+        parser.print_help()
+        sys.exit()
+    global check_compare_command
+    global script_path
+    global compare_tool
+    global output_dir
+    global temp_dir
+    # NOTE(review): starting with True makes multiple_compare() skip its
+    # check_command() lookup - confirm this is intended.
+    check_compare_command = True
+    script_path = sys.path[0]
+    compare_tool = script_path + '/vm-core'
+    query_list = args.query
+    # -q is not enforced by the parser; fail with a usage message instead of
+    # a TypeError further down.
+    if not query_list:
+        parser.error('at least one query VCF file is required (-q/--query)')
+    # an empty or missing -o falls back to ./output under the working directory
+    if args.output:
+        output_dir = args.output
+    else:
+        output_dir = os.getcwd() + '/output'
+    # makedirs also creates missing parent directories of a nested output path
+    if not os.path.isdir(output_dir):
+        os.makedirs(output_dir)
+    temp_dir = output_dir + '/temp'
+    multiple_compare(args.baseline, query_list, args.genome)
+    # the report reads one stat file per query, with prefixes query1, query2, ...
+    output_prefix_list = ['query' + str(i + 1) for i in range(len(query_list))]
+    create_stat_html(query_list, output_prefix_list)
+if __name__ == '__main__':
+    main()
diff --git a/xx.png b/xx.png
new file mode 100644
index 0000000..29fc8fa
Binary files /dev/null and b/xx.png differ

Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/varmatch.git

More information about the debian-med-commit mailing list