[med-svn] [falcon] 01/05: Imported Upstream version 1.8.4
Afif Elghraoui
afif at moszumanska.debian.org
Fri Dec 2 08:49:43 UTC 2016
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository falcon.
commit 54d9a95edd5ecc224ac237e62d79844ee1524ef6
Author: Afif Elghraoui <afif at debian.org>
Date: Fri Dec 2 00:12:48 2016 -0800
Imported Upstream version 1.8.4
---
DALIGNER/LAmerge.c | 2 +-
DAZZ_DB/GNUmakefile | 2 +-
FALCON/falcon_kit/mains/get_read_ctg_map.py | 12 +-
FALCON/falcon_kit/mains/run1.py | 50 +-
FALCON_unzip/LICENSE.txt | 32 +
FALCON_unzip/README.md | 1 +
FALCON_unzip/examples/fc_unzip.cfg | 17 +
FALCON_unzip/examples/unzip.sh | 2 +
FALCON_unzip/examples/unzip_obsoleted.sh | 33 +
FALCON_unzip/falcon_unzip/__init__.py | 36 +
FALCON_unzip/falcon_unzip/dedup_h_tigs.py | 70 +
FALCON_unzip/falcon_unzip/get_read_hctg_map.py | 110 ++
FALCON_unzip/falcon_unzip/graphs_to_h_tigs.py | 678 +++++++++
.../falcon_unzip/ovlp_filter_with_phase.py | 354 +++++
FALCON_unzip/falcon_unzip/phased_ovlp_to_graph.py | 1545 ++++++++++++++++++++
FALCON_unzip/falcon_unzip/phasing.py | 569 +++++++
FALCON_unzip/falcon_unzip/phasing_readmap.py | 73 +
FALCON_unzip/falcon_unzip/rr_hctg_track.py | 201 +++
FALCON_unzip/falcon_unzip/run_quiver.py | 394 +++++
FALCON_unzip/falcon_unzip/select_reads_from_bam.py | 113 ++
FALCON_unzip/falcon_unzip/unzip.py | 357 +++++
FALCON_unzip/setup.py | 24 +
FALCON_unzip/src/py_scripts/fc_dedup_h_tigs.py | 4 +
.../src/py_scripts/fc_get_read_hctg_map.py | 4 +
FALCON_unzip/src/py_scripts/fc_graphs_to_h_tigs.py | 4 +
.../src/py_scripts/fc_ovlp_filter_with_phase.py | 6 +
.../src/py_scripts/fc_phased_ovlp_to_graph.py | 5 +
FALCON_unzip/src/py_scripts/fc_phasing.py | 5 +
FALCON_unzip/src/py_scripts/fc_phasing_readmap.py | 4 +
FALCON_unzip/src/py_scripts/fc_quiver.py | 4 +
FALCON_unzip/src/py_scripts/fc_rr_hctg_track.py | 4 +
.../src/py_scripts/fc_select_reads_from_bam.py | 4 +
.../src/py_scripts/fc_track_reads_htigs0.py | 398 +++++
FALCON_unzip/src/py_scripts/fc_unzip.py | 4 +
FALCON_unzip/src/py_utils/align.sh | 10 +
FALCON_unzip/src/py_utils/call_SV_from_mum.py | 85 ++
FALCON_unzip/src/py_utils/fetch_read.py | 12 +
FALCON_unzip/src/py_utils/split_bam.py | 28 +
pypeFLOW/pwatcher/fs_based.py | 2 +-
pypeFLOW/pwatcher/network_based.py | 2 +-
pypeFLOW/pypeflow/simple_pwatcher_bridge.py | 10 +-
41 files changed, 5213 insertions(+), 57 deletions(-)
diff --git a/DALIGNER/LAmerge.c b/DALIGNER/LAmerge.c
index 6c1f3dc..ef443cf 100644
--- a/DALIGNER/LAmerge.c
+++ b/DALIGNER/LAmerge.c
@@ -168,7 +168,7 @@ static void ovl_reload(IO_block *in, int64 bsize)
remains = in->top - in->ptr;
if (remains > 0)
- memcpy(in->block, in->ptr, remains);
+ memmove(in->block, in->ptr, remains);
in->ptr = in->block;
in->top = in->block + remains;
in->top += fread(in->top,1,bsize-remains,in->stream);
diff --git a/DAZZ_DB/GNUmakefile b/DAZZ_DB/GNUmakefile
index de52eb3..2c11aff 100644
--- a/DAZZ_DB/GNUmakefile
+++ b/DAZZ_DB/GNUmakefile
@@ -36,6 +36,6 @@ clean:
rm -f DBupgrade.Sep.25.2014 DBupgrade.Dec.31.2014 DUSTupgrade.Jan.1.2015
rm -f dazz.db.tar.gz
-SRCS:=$(notdir $(wildcard ${THISDIR}/*.c))
+SRCS:=DB.c QV.c
DEPS:=$(patsubst %.c,%.d,${SRCS})
-include ${DEPS}
diff --git a/FALCON/falcon_kit/mains/get_read_ctg_map.py b/FALCON/falcon_kit/mains/get_read_ctg_map.py
index 349322f..91470ff 100644
--- a/FALCON/falcon_kit/mains/get_read_ctg_map.py
+++ b/FALCON/falcon_kit/mains/get_read_ctg_map.py
@@ -1,7 +1,4 @@
from __future__ import absolute_import
-#from pypeflow.data import PypeLocalFile, makePypeLocalFile, fn
-#from pypeflow.task import PypeTask, PypeThreadTaskBase, PypeTaskBase
-#from pypeflow.controller import PypeWorkflow, PypeMPWorkflow, PypeThreadWorkflow
from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
makePypeLocalFile, fn, PypeTask)
PypeThreadTaskBase = MyFakePypeThreadTaskBase
@@ -84,8 +81,7 @@ def get_read_ctg_map(rawread_dir, pread_dir, asm_dir):
task = PypeTask(
inputs = {'rawread_db': rawread_db},
outputs = {'rawread_id_file': rawread_id_file},
- TaskType = PypeThreadTaskBase,
- URL = 'task://localhost/dump_rawread_ids')
+ )
wf.addTask(task(dump_rawread_ids))
pread_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db'))
@@ -94,8 +90,7 @@ def get_read_ctg_map(rawread_dir, pread_dir, asm_dir):
task = PypeTask(
inputs = {'pread_db': pread_db},
outputs = {'pread_id_file': pread_id_file},
- TaskType = PypeThreadTaskBase,
- URL = 'task://localhost/dump_pread_ids' )
+ )
wf.addTask(task(dump_pread_ids))
wf.refreshTargets() # block
@@ -115,8 +110,7 @@ def get_read_ctg_map(rawread_dir, pread_dir, asm_dir):
task = PypeTask(
inputs = inputs,
outputs = {'read_to_contig_map': read_to_contig_map},
- TaskType = PypeThreadTaskBase,
- URL = 'task://localhost/get_ctg_read_map')
+ )
wf.addTask(task(generate_read_to_ctg_map))
wf.refreshTargets() # block
diff --git a/FALCON/falcon_kit/mains/run1.py b/FALCON/falcon_kit/mains/run1.py
index 8fd7352..d7bb13d 100644
--- a/FALCON/falcon_kit/mains/run1.py
+++ b/FALCON/falcon_kit/mains/run1.py
@@ -1,9 +1,6 @@
from .. import run_support as support
from .. import bash, pype_tasks
from ..util.system import only_these_symlinks
-#from pypeflow.pwatcher_bridge import PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase
-#from pypeflow.data import makePypeLocalFile, fn
-#from pypeflow.task import PypeTask
from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
makePypeLocalFile, fn, PypeTask)
import argparse
@@ -34,8 +31,6 @@ def create_daligner_tasks(basedir, scatter_fn):
make_daligner_task = PypeTask(inputs = inputs,
outputs = outputs,
parameters = parameters,
- TaskType = MyFakePypeThreadTaskBase,
- URL = URL,
wdir = wdir,
)
daligner_task = make_daligner_task(pype_tasks.task_run_daligner)
@@ -60,8 +55,6 @@ def create_merge_tasks(basedir, scatter_fn):
make_task = PypeTask(inputs = inputs,
outputs = outputs,
parameters = parameters,
- TaskType = MyFakePypeThreadTaskBase,
- URL = URL,
wdir = wdir,
)
task = make_task(pype_tasks.task_run_las_merge)
@@ -86,8 +79,6 @@ def create_consensus_tasks(basedir, scatter_fn):
make_c_task = PypeTask(inputs = inputs,
outputs = outputs,
parameters = parameters,
- TaskType = MyFakePypeThreadTaskBase,
- URL = URL,
wdir = wdir,
)
c_task = make_c_task(pype_tasks.task_run_consensus)
@@ -103,9 +94,7 @@ def create_merge_gather_task(wd, inputs):
outputs = {'las_fofn': las_fofn_plf,
'las_fopfn': las_fopfn_plf,
},
- TaskType = MyFakePypeThreadTaskBase,
)
- # URL = 'task://localhost/pmerge_gather')
task = make_task(pype_tasks.task_merge_gather)
return task, las_fofn_plf, las_fopfn_plf
@@ -116,8 +105,7 @@ def create_consensus_gather_task(wd, inputs):
make_cns_gather_task = PypeTask(
inputs = inputs, # consensus_out
outputs = {'preads_fofn': preads_fofn_plf},
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/cns_gather' )
+ )
task = make_cns_gather_task(pype_tasks.task_cns_gather)
return task, preads_fofn_plf
@@ -170,7 +158,7 @@ def run(wf, config,
make_fofn_abs_task = PypeTask(inputs = {'i_fofn': input_fofn_plf},
outputs = {'o_fofn': rawread_fofn_plf},
parameters = {},
- TaskType = MyFakePypeThreadTaskBase)
+ )
fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_raw)
wf.addTasks([fofn_abs_task])
@@ -195,7 +183,7 @@ def run(wf, config,
'run_jobs': run_jobs,
},
parameters = parameters,
- TaskType = MyFakePypeThreadTaskBase)
+ )
build_rdb_task = make_build_rdb_task(pype_tasks.task_build_rdb)
wf.addTasks([build_rdb_task])
@@ -218,8 +206,6 @@ def run(wf, config,
'pread_aln': False,
'config': config,
},
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/raw-daligner-scatter'
)
task = make_daligner_scatter(pype_tasks.task_daligner_scatter)
wf.addTask(task)
@@ -237,8 +223,7 @@ def run(wf, config,
inputs = daligner_out,
outputs = {'gathered': r_gathered_las_plf},
parameters = parameters,
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/rda_check' )
+ )
check_r_da_task = make_daligner_gather(pype_tasks.task_daligner_gather)
wf.addTask(check_r_da_task)
wf.refreshTargets(exitOnFailure=exitOnFailure)
@@ -257,8 +242,6 @@ def run(wf, config,
'db_prefix': 'raw_reads',
'config': config,
},
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/raw-merge-scatter'
)
task = make_task(pype_tasks.task_merge_scatter)
wf.addTask(task)
@@ -286,8 +269,6 @@ def run(wf, config,
'db_prefix': 'raw_reads',
'config': config,
},
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/raw-cns-scatter'
)
task = make_task(pype_tasks.task_consensus_scatter)
wf.addTask(task)
@@ -310,8 +291,7 @@ def run(wf, config,
'preads_fofn': preads_fofn_plf, },
outputs = {'pre_assembly_report': pre_assembly_report_plf, },
parameters = parameters,
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/report_pre_assembly')
+ )
task = make_task(pype_tasks.task_report_pre_assembly)
wf.addTask(task)
@@ -330,7 +310,7 @@ def run(wf, config,
make_fofn_abs_task = PypeTask(inputs = {'i_fofn': rawread_fofn_plf},
outputs = {'o_fofn': preads_fofn_plf},
parameters = {},
- TaskType = MyFakePypeThreadTaskBase)
+ )
fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_preads)
wf.addTasks([fofn_abs_task])
wf.refreshTargets([fofn_abs_task])
@@ -349,8 +329,7 @@ def run(wf, config,
'run_jobs': run_jobs,
},
parameters = parameters,
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/build_pdb')
+ )
build_pdb_task = make_build_pdb_task(pype_tasks.task_build_pdb)
wf.addTasks([build_pdb_task])
@@ -376,8 +355,6 @@ def run(wf, config,
'pread_aln': True,
'config': config,
},
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/preads-daligner-scatter'
)
task = make_daligner_scatter(pype_tasks.task_daligner_scatter)
wf.addTask(task)
@@ -394,8 +371,7 @@ def run(wf, config,
inputs = daligner_out,
outputs = {'gathered': p_gathered_las_plf},
parameters = parameters,
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/pda_check' )
+ )
check_p_da_task = make_daligner_gather(pype_tasks.task_daligner_gather)
wf.addTask(check_p_da_task)
wf.refreshTargets(exitOnFailure=exitOnFailure)
@@ -415,9 +391,7 @@ def run(wf, config,
'db_prefix': 'preads',
'config': config,
},
- TaskType = MyFakePypeThreadTaskBase,
- #URL = 'task://localhost/preads-merge-scatter'
- )
+ )
task = make_task(pype_tasks.task_merge_scatter)
wf.addTask(task)
wf.refreshTargets(exitOnFailure=exitOnFailure)
@@ -447,8 +421,7 @@ def run(wf, config,
'config': config,
'sge_option': config['sge_option_fc'],
},
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/db2falcon' )
+ )
wf.addTask(make_run_db2falcon(pype_tasks.task_run_db2falcon))
falcon_asm_done = makePypeLocalFile( os.path.join(falcon_asm_dir, 'falcon_asm_done'))
@@ -463,8 +436,7 @@ def run(wf, config,
'pread_dir': pread_dir,
'sge_option': config['sge_option_fc'],
},
- TaskType = MyFakePypeThreadTaskBase,
- URL = 'task://localhost/falcon_asm' )
+ )
wf.addTask(make_run_falcon_asm(pype_tasks.task_run_falcon_asm))
wf.refreshTargets()
diff --git a/FALCON_unzip/LICENSE.txt b/FALCON_unzip/LICENSE.txt
new file mode 100644
index 0000000..191f795
--- /dev/null
+++ b/FALCON_unzip/LICENSE.txt
@@ -0,0 +1,32 @@
+The Clear BSD License
+
+Copyright (c) 2016, Pacific Biosciences
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted (subject to the limitations in the disclaimer
+below) provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its contributors may be used
+ to endorse or promote products derived from this software without specific
+ prior written permission.
+
+NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS
+LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGE.
diff --git a/FALCON_unzip/README.md b/FALCON_unzip/README.md
new file mode 100644
index 0000000..fd5191c
--- /dev/null
+++ b/FALCON_unzip/README.md
@@ -0,0 +1 @@
+Please refer to https://github.com/PacificBiosciences/FALCON_unzip/wiki
diff --git a/FALCON_unzip/examples/fc_unzip.cfg b/FALCON_unzip/examples/fc_unzip.cfg
new file mode 100644
index 0000000..787360c
--- /dev/null
+++ b/FALCON_unzip/examples/fc_unzip.cfg
@@ -0,0 +1,17 @@
+[General]
+job_type = SGE
+
+[Unzip]
+input_fofn= input.fofn
+input_bam_fofn= input_bam.fofn
+
+smrt_bin=/mnt/secondary/builds/full/3.0.0/prod/current-build_smrtanalysis/smrtcmds/bin/
+
+jobqueue = your_sge_queue
+sge_phasing= -pe smp 12 -q %(jobqueue)s
+sge_quiver= -pe smp 12 -q %(jobqueue)s
+sge_track_reads= -pe smp 12 -q %(jobqueue)s
+sge_blasr_aln= -pe smp 24 -q %(jobqueue)s
+sge_hasm= -pe smp 48 -q %(jobqueue)s
+unzip_concurrent_jobs = 64
+quiver_concurrent_jobs = 64
diff --git a/FALCON_unzip/examples/unzip.sh b/FALCON_unzip/examples/unzip.sh
new file mode 100644
index 0000000..529e368
--- /dev/null
+++ b/FALCON_unzip/examples/unzip.sh
@@ -0,0 +1,2 @@
+fc_unzip.py fc_unzip.cfg
+fc_quiver.py fc_unzip.cfg
diff --git a/FALCON_unzip/examples/unzip_obsoleted.sh b/FALCON_unzip/examples/unzip_obsoleted.sh
new file mode 100644
index 0000000..88d22da
--- /dev/null
+++ b/FALCON_unzip/examples/unzip_obsoleted.sh
@@ -0,0 +1,33 @@
+# identify the raw reads belonging to each contig
+fc_track_reads.py
+# get the reads assigned to each contig from the initial *.subreads.fasta files
+mkdir -p 3-unzip/reads/
+fc_fetch_reads.py
+# phase the reads
+fc_unzip.py fc_unzip.cfg
+# do the haplotig assembly; this part will be integrated into fc_unzip.py
+mkdir -p 3-unzip/1-hasm/
+cd 3-unzip/1-hasm/
+find ../0-phasing/ -name "rid_to_phase.*" | xargs cat > rid_to_phase.all
+fc_ovlp_filter_with_phase.py --fofn ../../2-asm-falcon/las.fofn --max_diff 120 --max_cov 120 --min_cov 1 --n_core 12 --min_len 2500 --db ../../1-preads_ovl/preads.db --rid_phase_map ./rid_to_phase.all > preads.p_ovl
+fc_phased_ovlp_to_graph.py preads.p_ovl --min_len 2500 > fc.log
+fc_graphs_to_h_tigs.py --fc_asm_path ../../2-asm-falcon/ --fc_hasm_path ./ --ctg_id all --rid_phase_map ./rid_to_phase.all --fasta ../../1-preads_ovl/preads4falcon.fasta
+WD=$PWD
+for f in `cat ../reads/ctg_list `;do cd $WD/$f; fc_dedup_h_tigs.py $f; done
+# prepare for quivering the haplotigs
+cd $WD/..
+rm all_phased_reads all_h_ctg_ids all_h_ctg_edges all_p_ctg_edges all_p_ctg.fa all_h_ctg.fa
+find 0-phasing -name "phased_reads" | sort | xargs cat >> all_phased_reads
+find 1-hasm -name "h_ctg_ids.*" | sort | xargs cat >> all_h_ctg_ids
+find 1-hasm -name "p_ctg_edges.*" | sort | xargs cat >> all_p_ctg_edges
+find 1-hasm -name "h_ctg_edges.*" | sort | xargs cat >> all_h_ctg_edges
+find 1-hasm -name "p_ctg.*.fa" | sort | xargs cat >> all_p_ctg.fa
+find 1-hasm -name "h_ctg.*.fa" | sort | xargs cat >> all_h_ctg.fa
+cd ../
+# identify the raw reads belonging to each primary contig or haplotig
+fc_track_reads_htigs.py
+# decompose the PacBio raw reads (with pulse information) in the bam files into an individual bam file for each haplotig or primary contig
+mkdir -p 4-quiver/reads/
+fc_select_reads_from_bam.py input_bam.fofn
+# run blasr and quiver to generate consensus for each haplotig or primary contig
+fc_quiver.py fc_unzip.cfg
diff --git a/FALCON_unzip/falcon_unzip/__init__.py b/FALCON_unzip/falcon_unzip/__init__.py
new file mode 100644
index 0000000..94e4fd5
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/__init__.py
@@ -0,0 +1,36 @@
+#################################################################################$$
+# Copyright (c) 2011-2015, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials provided
+# with the distribution.
+#
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
diff --git a/FALCON_unzip/falcon_unzip/dedup_h_tigs.py b/FALCON_unzip/falcon_unzip/dedup_h_tigs.py
new file mode 100644
index 0000000..61bd8c3
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/dedup_h_tigs.py
@@ -0,0 +1,70 @@
+import os
+import sys
+from falcon_kit.FastaReader import FastaReader
+
+def main(args):
+
+ ctg_id = sys.argv[1]
+
+ if os.path.exists("h_ctg_all.{ctg_id}.fa".format(ctg_id = ctg_id)):
+ os.system("nucmer -mum p_ctg.{ctg_id}.fa h_ctg_all.{ctg_id}.fa -p hp_aln".format(ctg_id = ctg_id))
+ os.system("show-coords -T -H -l -c hp_aln.delta > hp_aln.coor")
+ else:
+ sys.exit(0) # it is ok if there is no h_ctg_all.{ctg_id}.fa; we don't want to interrupt the workflow
+
+
+ if os.path.exists("hp_aln.coor"):
+ filter_out = set()
+ with open("hp_aln.coor") as f:
+ for row in f:
+ row = row.strip().split()
+ q_cov = float(row[10])
+ idt = float(row[6])
+ if q_cov > 99 and idt > 99:
+ filter_out.add(row[-1])
+
+ p_ctg_to_phase = {}
+ with open("p_ctg_path.%s" % ctg_id) as f:
+ for row in f:
+ row = row.strip().split()
+ b_id, ph_id = (int(row[-2]), int(row[-1]) )
+ p_ctg_to_phase.setdefault( row[0], {} )
+ p_ctg_to_phase[row[0]].setdefault( ( b_id, ph_id ), 0)
+ p_ctg_to_phase[row[0]][ ( b_id, ph_id ) ] += 1
+
+
+ h_ctg_to_phase = {}
+ with open("h_ctg_path.%s" % ctg_id) as f:
+ for row in f:
+ row = row.strip().split()
+ b_id, ph_id = (int(row[-2]), int(row[-1]) )
+ h_ctg_to_phase.setdefault( row[0], {} )
+ h_ctg_to_phase[row[0]].setdefault( ( b_id, ph_id ), 0)
+ h_ctg_to_phase[row[0]][ ( b_id, ph_id ) ] += 1
+
+ h_ids = open("h_ctg_ids.%s" % ctg_id,"w")
+ with open("h_ctg.%s.fa" % ctg_id, "w") as f:
+ h_tig_all = FastaReader("h_ctg_all.%s.fa" % ctg_id)
+ for r in h_tig_all:
+ p_ctg_phase = p_ctg_to_phase.get(r.name.split("_")[0], {})
+
+ if len(r.sequence) < 500:
+ continue
+
+ if r.name in filter_out:
+ edge_count = sum(h_ctg_to_phase[ r.name ].values())
+ same_phase_to_p_ctg_count = 0
+ for b_id, ph_id in h_ctg_to_phase[ r.name ]:
+ if b_id != -1:
+ if (b_id, ph_id) in p_ctg_phase:
+ same_phase_to_p_ctg_count += h_ctg_to_phase[ r.name ][ (b_id, ph_id) ]
+ unphased_edge_count = h_ctg_to_phase[ r.name ].get( (-1, 0), 0 )
+
+ print r.name, edge_count, unphased_edge_count, same_phase_to_p_ctg_count
+ if edge_count - unphased_edge_count - same_phase_to_p_ctg_count < 5: # unless there are many segments from phases not in the p_ctg, filter it out
+ continue
+
+ print >>f, ">"+r.name
+ print >>f, r.sequence
+ print >> h_ids, r.name
+ h_ids.close()
diff --git a/FALCON_unzip/falcon_unzip/get_read_hctg_map.py b/FALCON_unzip/falcon_unzip/get_read_hctg_map.py
new file mode 100644
index 0000000..18909f2
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/get_read_hctg_map.py
@@ -0,0 +1,110 @@
+from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
+ makePypeLocalFile, fn, PypeTask)
+import argparse
+import logging
+import os
+import sys
+
+def make_dirs(d):
+ if not os.path.isdir(d):
+ os.makedirs(d)
+
+def generate_read_to_hctg_map(self):
+ rawread_id_file = fn( self.rawread_id_file )
+ pread_id_file = fn( self.pread_id_file )
+ read_to_contig_map = fn( self.read_to_contig_map )
+
+ pread_did_to_rid = open(pread_id_file).read().split('\n')
+ rid_to_oid = open(rawread_id_file).read().split('\n')
+
+ h_ctg_edges = fn( self.h_ctg_edges )
+ p_ctg_edges = fn( self.p_ctg_edges )
+
+ h_ctg_ids = set()
+ with open(fn(self.h_ctg_ids)) as f:
+ for row in f:
+ row = row.strip()
+ h_ctg_ids.add( row )
+
+ pread_to_contigs = {}
+
+ for fname in ( p_ctg_edges, h_ctg_edges):
+ with open(fname) as f:
+ for row in f:
+ row = row.strip().split()
+ ctg = row[0]
+ if len(ctg.split('_')) > 1 and ctg not in h_ctg_ids:
+ continue
+ n1 = row[1]
+ n2 = row[2]
+ pid1 = int(n1.split(':')[0])
+ pid2 = int(n2.split(':')[0])
+ rid1 = pread_did_to_rid[pid1].split('/')[1]
+ rid2 = pread_did_to_rid[pid2].split('/')[1]
+ rid1 = int(int(rid1)/10)
+ rid2 = int(int(rid2)/10)
+ oid1 = rid_to_oid[rid1]
+ oid2 = rid_to_oid[rid2]
+ k1 = (pid1, rid1, oid1)
+ pread_to_contigs.setdefault( k1, set() )
+ pread_to_contigs[ k1 ].add( ctg )
+ k2 = (pid2, rid2, oid2)
+ pread_to_contigs.setdefault( k2, set() )
+ pread_to_contigs[ k2 ].add( ctg )
+
+ with open(read_to_contig_map, 'w') as f:
+ for k in pread_to_contigs:
+ pid, rid, oid = k
+ for ctg in list(pread_to_contigs[ k ]):
+ print >>f, '%09d %09d %s %s' % (pid, rid, oid, ctg)
+
+def get_read_hctg_map(asm_dir, hasm_dir, quiver_dir):
+ wf = PypeProcWatcherWorkflow(
+ max_jobs=12, # TODO: Why was NumThreads ever set? There is only one task!
+ )
+
+ rawread_id_file = makePypeLocalFile(os.path.join(asm_dir, 'read_maps/dump_rawread_ids/rawread_ids'))
+ pread_id_file = makePypeLocalFile(os.path.join(asm_dir, 'read_maps/dump_pread_ids/pread_ids'))
+ h_ctg_edges = makePypeLocalFile(os.path.join(hasm_dir, 'all_h_ctg_edges'))
+ p_ctg_edges = makePypeLocalFile(os.path.join(hasm_dir, 'all_p_ctg_edges'))
+ h_ctg_ids = makePypeLocalFile( os.path.join(hasm_dir, "all_h_ctg_ids"))
+ read_map_dir = os.path.join(quiver_dir, 'read_maps')
+ make_dirs(read_map_dir)
+
+ read_to_contig_map = makePypeLocalFile(os.path.join(read_map_dir, 'read_to_contig_map'))
+
+ inputs = { 'rawread_id_file': rawread_id_file,
+ 'pread_id_file': pread_id_file,
+ 'h_ctg_edges': h_ctg_edges,
+ 'p_ctg_edges': p_ctg_edges,
+ 'h_ctg_ids': h_ctg_ids}
+
+ make_task = PypeTask(
+ inputs = inputs,
+ outputs = {'read_to_contig_map': read_to_contig_map},
+ )
+ wf.addTask(make_task(generate_read_to_hctg_map))
+ wf.refreshTargets() # block
+
+
+
+def parse_args(argv):
+ parser = argparse.ArgumentParser(description='generate `4-quiver/read_maps/read_to_contig_map` that contains the \
+information from the chain of mapping: (contig id, last col) -> (internal p-read id) -> (internal raw-read id) -> (original read id)\n \
+it assumes the 2-asm-falcon/read_maps ... /dump_rawread_ids/rawread_ids and /dump_pread_ids/pread_ids are already generated',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('--basedir', type=str, default="./", help='the base working dir of a FALCON assembly')
+ args = parser.parse_args(argv[1:])
+ return args
+
+def main(argv=sys.argv):
+ logging.basicConfig()
+ args = parse_args(argv)
+ basedir = args.basedir
+ #rawread_dir = os.path.abspath(os.path.join(basedir, '0-rawreads'))
+ #pread_dir = os.path.abspath(os.path.join(basedir, '1-preads_ovl'))
+ asm_dir = os.path.abspath(os.path.join(basedir, '2-asm-falcon'))
+ hasm_dir = os.path.abspath(os.path.join(basedir, '3-unzip'))
+ quiver_dir = os.path.abspath(os.path.join(basedir, '4-quiver'))
+
+ get_read_hctg_map(asm_dir=asm_dir, hasm_dir=hasm_dir, quiver_dir=quiver_dir)
diff --git a/FALCON_unzip/falcon_unzip/graphs_to_h_tigs.py b/FALCON_unzip/falcon_unzip/graphs_to_h_tigs.py
new file mode 100644
index 0000000..fe7fd00
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/graphs_to_h_tigs.py
@@ -0,0 +1,678 @@
+from falcon_kit.fc_asm_graph import AsmGraph
+from falcon_kit.FastaReader import FastaReader
+import os
+import networkx as nx
+from multiprocessing import Pool
+import argparse
+import re
+import sys
+
+RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
+## for shared memory usage
+global p_asm_G
+global h_asm_G
+global all_rid_to_phase
+global seqs
+
+def mkdir(d):
+ if not os.path.isdir(d):
+ os.makedirs(d)
+
+def reverse_end( node_id ):
+ node_id, end = node_id.split(":")
+ new_end = "B" if end == "E" else "E"
+ return node_id + ":" + new_end
+
+
+def load_sg_seq(all_read_ids, fasta_fn):
+
+ seqs = {}
+ # load the sequences of the selected p-reads into memory
+ f = FastaReader(fasta_fn)
+ for r in f:
+ if r.name not in all_read_ids:
+ continue
+ seqs[r.name] = r.sequence.upper()
+ return seqs
+
+def generate_haplotigs_for_ctg(input_):
+
+ ctg_id, out_dir = input_
+ global p_asm_G
+ global h_asm_G
+ global all_rid_to_phase
+ global seqs
+ arid_to_phase = all_rid_to_phase[ctg_id]
+
+ mkdir( out_dir )
+
+ ctg_G = p_asm_G.get_sg_for_ctg(ctg_id)
+
+ ctg_nodes = set(ctg_G.nodes())
+
+ sg = nx.DiGraph()
+
+ for v, w in ctg_G.edges():
+
+ vrid = v[:9]
+ wrid = w[:9]
+
+ edge_data = p_asm_G.sg_edges[ (v, w) ]
+ if edge_data[-1] != "G":
+ continue
+
+ vphase = arid_to_phase.get(vrid, (-1,0))
+ wphase = arid_to_phase.get(wrid, (-1,0))
+ if vphase[0] == wphase[0] and vphase[1] != wphase[1]:
+ cross_phase = "Y"
+ else:
+ cross_phase = "N"
+
+ sg.add_node( v, label= "%d_%d" % vphase,
+ phase="%d_%d" % vphase,
+ src="P" )
+
+ sg.add_node( w, label= "%d_%d" % wphase,
+ phase="%d_%d" % wphase,
+ src="P" )
+
+ sg.add_edge(v, w, src="OP", cross_phase = cross_phase)
+
+ # we need to add the complementary edges, as the ctg_graph does not contain the dual edges
+ rv = reverse_end(v)
+ rw = reverse_end(w)
+ sg.add_node( rv, label= "%d_%d" % vphase,
+ phase="%d_%d" % vphase,
+ src="P" )
+
+ sg.add_node( rw, label= "%d_%d" % wphase,
+ phase="%d_%d" % wphase,
+ src="P" )
+
+ sg.add_edge(rw, rv, src="OP", cross_phase = cross_phase)
+
+ PG_nodes = set(sg.nodes())
+ PG_edges = set(sg.edges())
+
+ for v, w in h_asm_G.sg_edges:
+
+ vrid = v[:9]
+ wrid = w[:9]
+
+ if vrid not in arid_to_phase:
+ continue
+ if wrid not in arid_to_phase:
+ continue
+
+ if (v, w) in PG_edges:
+ if p_asm_G.sg_edges[(v,w)][-1] == "G":
+ continue
+
+ edge_data = h_asm_G.sg_edges[ (v, w) ]
+
+ if edge_data[-1] != "G":
+ continue
+
+ cross_phase = "N"
+ if v not in PG_nodes:
+ sg.add_node( v, label= "%d_%d" % arid_to_phase[vrid],
+ phase="%d_%d" % arid_to_phase[vrid],
+ src="H" )
+
+ if w not in PG_nodes:
+ sg.add_node( w, label= "%d_%d" % arid_to_phase[wrid],
+ phase="%d_%d" % arid_to_phase[wrid],
+ src="H" )
+
+ sg.add_edge(v, w, src="H", cross_phase = cross_phase)
+
+ rv = reverse_end(v)
+ rw = reverse_end(w)
+ if rv not in PG_nodes:
+ sg.add_node( rv, label= "%d_%d" % arid_to_phase[vrid],
+ phase="%d_%d" % arid_to_phase[vrid],
+ src="H" )
+
+ if rw not in PG_nodes:
+ sg.add_node( rw, label= "%d_%d" % arid_to_phase[wrid],
+ phase="%d_%d" % arid_to_phase[wrid],
+ src="H" )
+
+ sg.add_edge(rw, rv, src="H", cross_phase = cross_phase)
+
+ sg0 = sg.copy()
+ for v, w in h_asm_G.sg_edges:
+ vrid = v[:9]
+ wrid = w[:9]
+ if vrid not in arid_to_phase:
+ continue
+ if wrid not in arid_to_phase:
+ continue
+
+ if (v, w) in PG_edges:
+ if p_asm_G.sg_edges[(v,w)][-1] == "G":
+ continue
+
+ edge_data = h_asm_G.sg_edges[ (v, w) ]
+
+ if sg0.in_degree(w) == 0:
+ cross_phase = "Y"
+ if v not in PG_nodes:
+ sg.add_node( v, label= "%d_%d" % arid_to_phase[vrid],
+ phase="%d_%d" % arid_to_phase[vrid],
+ src="H" )
+
+ if w not in PG_nodes:
+ sg.add_node( w, label= "%d_%d" % arid_to_phase[wrid],
+ phase="%d_%d" % arid_to_phase[wrid],
+ src="H" )
+
+ sg.add_edge(v, w, src="ext", cross_phase = cross_phase)
+
+ rv = reverse_end(v)
+ rw = reverse_end(w)
+ if rv not in PG_nodes:
+ sg.add_node( rv, label= "%d_%d" % arid_to_phase[vrid],
+ phase="%d_%d" % arid_to_phase[vrid],
+ src="H" )
+
+ if rw not in PG_nodes:
+ sg.add_node( rw, label= "%d_%d" % arid_to_phase[wrid],
+ phase="%d_%d" % arid_to_phase[wrid],
+ src="H" )
+
+ sg.add_edge(rw, rv, src="ext", cross_phase = cross_phase)
+
+ if sg0.out_degree(v) == 0:
+ cross_phase = "Y"
+ if v not in PG_nodes:
+ sg.add_node( v, label= "%d_%d" % arid_to_phase[vrid],
+ phase="%d_%d" % arid_to_phase[vrid],
+ src="H" )
+
+ if w not in PG_nodes:
+ sg.add_node( w, label= "%d_%d" % arid_to_phase[wrid],
+ phase="%d_%d" % arid_to_phase[wrid],
+ src="H" )
+
+ sg.add_edge(v, w, src="ext", cross_phase = cross_phase)
+
+ rv = reverse_end(v)
+ rw = reverse_end(w)
+ if rv not in PG_nodes:
+ sg.add_node( rv, label= "%d_%d" % arid_to_phase[vrid],
+ phase="%d_%d" % arid_to_phase[vrid],
+ src="H" )
+
+ if rw not in PG_nodes:
+ sg.add_node( rw, label= "%d_%d" % arid_to_phase[wrid],
+ phase="%d_%d" % arid_to_phase[wrid],
+ src="H" )
+
+ sg.add_edge(rw, rv, src="ext", cross_phase = cross_phase)
+
+ sg2 = sg.copy()
+ ctg_nodes_r = set([ reverse_end(v) for v in list(ctg_nodes) ])
+ for v, w in ctg_G.edges():
+ sg2.remove_edge(v, w)
+ rv, rw = reverse_end(v), reverse_end(w)
+ sg2.remove_edge(rw, rv)
+ for v in sg2.nodes():
+ if sg2.out_degree(v) == 0 and sg2.in_degree(v) == 0:
+ sg2.remove_node(v)
+
+ nodes_to_remove = set()
+ edges_to_remove = set()
+ for sub_g in nx.weakly_connected_component_subgraphs(sg2):
+ sub_g_nodes = set(sub_g.nodes())
+ if len(sub_g_nodes & ctg_nodes_r) > 0 and len(sub_g_nodes & ctg_nodes) > 0:
+ # remove cross edge
+ sources = [n for n in sub_g.nodes() if sub_g.in_degree(n) == 0 or n in ctg_nodes or n in ctg_nodes_r ]
+ sinks = [n for n in sub_g.nodes() if sub_g.out_degree(n) == 0 or n in ctg_nodes or n in ctg_nodes_r ]
+ edges_to_keep = set()
+ for v in sources:
+ for w in sinks:
+ path = []
+ if v in ctg_nodes and w not in ctg_nodes_r:
+ try:
+ path = nx.shortest_path( sub_g, v, w )
+ except nx.exception.NetworkXNoPath:
+ path = []
+ elif v not in ctg_nodes and w in ctg_nodes_r:
+ try:
+ path = nx.shortest_path( sub_g, v, w )
+ except nx.exception.NetworkXNoPath:
+ path = []
+
+ if len(path) >= 2:
+ v1 = path[0]
+ for w1 in path[1:]:
+ edges_to_keep.add( (v1, w1) )
+ rv1, rw1 = reverse_end(v1), reverse_end(w1)
+ edges_to_keep.add( (rw1, rv1) )
+ v1 = w1
+ for v, w in sub_g.edges():
+ if (v, w) not in edges_to_keep:
+ edges_to_remove.add( (v, w) )
+ rv, rw = reverse_end(v), reverse_end(w)
+ edges_to_remove.add( (rw, rv) )
+
+
+ if len(sub_g_nodes & ctg_nodes_r) == 0 and len(sub_g_nodes & ctg_nodes) == 0:
+ nodes_to_remove.update( sub_g_nodes )
+ nodes_to_remove.update( set( [reverse_end(v) for v in list(sub_g_nodes)] ) )
+
+ for v, w in list(edges_to_remove):
+ sg.remove_edge(v, w)
+
+ for v in nodes_to_remove:
+ sg.remove_node(v)
+
+ for v in sg.nodes():
+ if sg.out_degree(v) == 0 and sg.in_degree(v) == 0:
+ sg.remove_node(v)
+
+ #nx.write_gexf(sg, "full_g.gexf")
+
+ s_node = p_asm_G.ctg_data[ctg_id][5][0][0]
+ t_node = p_asm_G.ctg_data[ctg_id][5][-1][-1]
+
+ for v, w in sg.edges():
+ phase0 = sg.node[v]["phase"].split("_")
+ phase1 = sg.node[w]["phase"].split("_")
+ if phase0 == phase1:
+ sg[v][w]["weight"] = 10
+ sg[v][w]["score"] = 1
+ sg[v][w]["label"] = "type0"
+ else:
+ if phase0[0] == phase1[0]:
+ sg[v][w]["weight"] = 1
+ sg[v][w]["score"] = 100000
+ sg[v][w]["label"] = "type1"
+ else:
+ sg[v][w]["weight"] = 5
+ sg[v][w]["score"] = 50
+ sg[v][w]["label"] = "type2"
+
+
+ sg2 = sg.copy()
+ edge_to_remove = set()
+ for v, w in sg2.edges():
+ if sg2[v][w]["src"] == "ext":
+ edge_to_remove.add( (v, w) )
+ rv, rw = reverse_end(v), reverse_end(w)
+ edge_to_remove.add( (rw, rv) )
+
+ if sg2.node[v]["phase"] == sg2.node[w]["phase"]:
+ continue
+ flag1 = 0
+ flag2 = 0
+ for e in sg2.out_edges(v):
+ if sg2.node[e[0]]["phase"] == sg2.node[e[1]]["phase"]:
+ flag1 = 1
+ break
+ if flag1 == 1:
+ for e in sg2.in_edges(w):
+ if sg2.node[e[0]]["phase"] == sg2.node[e[1]]["phase"]:
+ flag2 = 1
+ break
+ if flag2 == 1:
+ edge_to_remove.add( (v, w) )
+ rv, rw = reverse_end(v), reverse_end(w)
+ edge_to_remove.add( (rw, rv) )
+
+ for v, w in list(edge_to_remove):
+ sg2.remove_edge(v, w)
+
+ ## remove some new shortcuts caused by repeats in the haplotype edges
+ edge_to_remove = set()
+ with open(os.path.join(out_dir, "path_len.%s" % ctg_id), "w") as f:
+
+ all_length_from_s = nx.shortest_path_length(sg, source=s_node)
+ edge_to_remove = set()
+
+ for w in all_length_from_s:
+ longest = 0
+ if sg.in_degree(w) >= 2:
+ for e in sg.in_edges(w):
+ v = e[0]
+ if v in all_length_from_s:
+ len_ = all_length_from_s[v]
+ if len_ > longest:
+ longest = len_
+ if longest == 0:
+ continue
+ for e in sg.in_edges(w):
+ v = e[0]
+ if v in all_length_from_s:
+ len_ = all_length_from_s[v]
+ print >>f, ctg_id, "link_lengths", v, w, longest, len_
+ if longest - len_ > 10 and sg[v][w]["src"] != "OP": #make sure we always have one path derived from the original graph from s to t
+ edge_to_remove.add( (v, w) )
+ rv, rw = reverse_end(v), reverse_end(w)
+ edge_to_remove.add( (rw, rv) )
+
+ for v, w in list(edge_to_remove):
+ sg.remove_edge(v, w)
+ print >>f, ctg_id, "removed", v, w
+
+ for v, w in list(edge_to_remove):
+ try:
+ sg2.remove_edge(v, w)
+ except Exception:
+ pass
+
+
+ nx.write_gexf(sg, os.path.join(out_dir, "sg.gexf" ))
+ nx.write_gexf(sg2, os.path.join(out_dir, "sg2.gexf" ))
+
+
+ try:
+ s_path = nx.shortest_path(sg2, source=s_node, target=t_node, weight="score")
+ except nx.exception.NetworkXNoPath:
+ s_path = nx.shortest_path(sg, source=s_node, target=t_node, weight="score")
+
+ s_path_edges = []
+ for i in xrange(len(s_path)-1):
+ v = s_path[i]
+ w = s_path[i+1]
+ sg[v][w]["weight"] = 15
+ s_path_edges.append( (v,w) )
+
+ s_path_edge_set = set(s_path_edges)
+
+
+
+ #output the updated primary contig
+ p_tig_path = open(os.path.join(out_dir, "p_ctg_path.%s" % ctg_id),"w")
+ p_tig_fa = open(os.path.join(out_dir, "p_ctg.%s.fa" % ctg_id),"w")
+ edges_to_remove1 = set()
+ edges_to_remove2 = set()
+ with open(os.path.join(out_dir, "p_ctg_edges.%s" % ctg_id), "w") as f:
+ seq = []
+ for v, w in s_path_edges:
+ sg[v][w]["h_edge"] = 1
+ vrid = v.split(":")[0]
+ wrid = w.split(":")[0]
+ vphase = arid_to_phase.get(vrid, (-1,0))
+ wphase = arid_to_phase.get(wrid, (-1,0))
+ print >>f, "%s" % ctg_id, v, w, sg[v][w]["cross_phase"], sg[v][w]["src"], vphase[0], vphase[1], wphase[0], wphase[1]
+
+ if sg.edge[v][w]["src"] == "OP":
+ edge_data = p_asm_G.sg_edges[ (v,w) ]
+ else:
+ edge_data = h_asm_G.sg_edges[ (v,w) ]
+
+ seq_id, s, t = edge_data[0]
+ if s < t:
+ seq.append(seqs[ seq_id ][ s:t ])
+ else:
+ seq.append("".join([ RCMAP[c] for c in seqs[ seq_id ][ s:t:-1 ] ]))
+ print >>p_tig_path, "%s" % ctg_id, v, w, seq_id, s, t, edge_data[1], edge_data[2], "%d %d" % arid_to_phase.get(seq_id, (-1,0))
+ sg[v][w]["tig_id"] = "%s" % ctg_id
+
+ rv, rw = reverse_end(v), reverse_end(w)
+ edges_to_remove1.add( (v, w) )
+ edges_to_remove2.add( (rw, rv) )
+
+ print >> p_tig_fa, ">%s" % ctg_id
+ print >> p_tig_fa, "".join(seq)
+
+ p_tig_fa.close()
+ p_tig_path.close()
+
+
+
+ sg2 = sg.copy()
+ reachable1 = nx.descendants(sg2, s_node)
+ sg2_r = sg2.reverse()
+ reachable2 = nx.descendants(sg2_r, t_node)
+
+ reachable_all = reachable1 | reachable2
+ reachable_both = reachable1 & reachable2
+
+
+ for v, w in list(edges_to_remove2 | edges_to_remove1):
+ sg2.remove_edge( v, w )
+
+ for v, w in sg2.edges():
+ if sg2[v][w]["cross_phase"] == "Y":
+ sg2.remove_edge( v, w )
+
+ for v in sg2.nodes():
+ if v not in reachable_all:
+ sg2.remove_node(v)
+
+ for v in sg2.nodes():
+ if sg2.out_degree(v) == 0 and sg2.in_degree(v) == 0:
+ sg2.remove_node(v)
+ continue
+ if v in reachable_both:
+ sg2.node[v]["reachable"] = 1
+ else:
+ sg2.node[v]["reachable"] = 0
+
+ dump_graph = False # the code segment below is useful for showing the graph
+ if dump_graph == True:
+ nx.write_gexf(sg2, "%s_1.gexf" % ctg_id)
+
+ p_path_nodes = set(s_path)
+ p_path_rc_nodes = set( [reverse_end(v) for v in s_path] )
+
+ sg2_nodes = set(sg2.nodes())
+ for v in p_asm_G.get_sg_for_ctg(ctg_id).nodes():
+ rv = reverse_end(v)
+ p_path_rc_nodes.add( rv )
+ if rv in sg2_nodes:
+ sg2.remove_node(rv)
+
+
+ h_tig_path = open(os.path.join(out_dir, "h_ctg_path.%s" % ctg_id),"w")
+ h_tig_fa = open(os.path.join(out_dir, "h_ctg_all.%s.fa" % ctg_id),"w")
+ edges_to_remove = set()
+
+ labelled_node = set()
+ with open(os.path.join(out_dir, "h_ctg_edges.%s" % ctg_id),"w") as f:
+ h_tig_id = 1
+ h_paths = {}
+ #print "number of components:", len([tmp for tmp in nx.weakly_connected_component_subgraphs(sg2)])
+ for sub_hg_0 in nx.weakly_connected_component_subgraphs(sg2):
+ sub_hg = sub_hg_0.copy()
+ while sub_hg.size() > 5:
+ #print "sub_hg size:", len(sub_hg.nodes())
+ sources = [n for n in sub_hg.nodes() if sub_hg.in_degree(n) != 1 ]
+ sinks = [n for n in sub_hg.nodes() if sub_hg.out_degree(n) != 1 ]
+
+
+ #print "number of sources", len(sources), sources
+ #print "number of sinks", len(sinks), sinks
+ if len(sources) == 0 and len(sinks) == 0: #TODO: the rest of the sub-graph consists of cycles; we need to break and print a warning message
+ break
+
+ longest = []
+
+ eliminated_sinks = set()
+ s_longest = {}
+ for s in sources:
+ #print "test source",s, len(eliminated_sinks)
+ if s in labelled_node:
+ continue
+ s_path = []
+ for t in sinks:
+ if t in eliminated_sinks:
+ continue
+ try:
+ path = nx.shortest_path(sub_hg, s, t, weight="score")
+ #print "test path len:", len(path), s, t
+ except nx.exception.NetworkXNoPath:
+ path = []
+ continue
+ s_path.append( [ path, t ] )
+ s_path.sort(key = lambda x: -len(x[0]))
+ if len(s_path) == 0:
+ continue
+ s_longest[s] = s_path[0][0]
+ if len(s_longest[s]) > len(longest):
+ longest = s_longest[s]
+ #print "s longest", longest[0], longest[-1], len(longest)
+ for path, t in s_path[1:]:
+ eliminated_sinks.add(t)
+ #print "elimated t", t
+
+
+ if len(longest) == 0:
+ break
+
+ s = longest[0]
+ t = longest[-1]
+ h_paths[ ( s, t ) ] = longest
+
+ labelled_node.add(s)
+ rs = reverse_end(s)
+ labelled_node.add(rs)
+
+ for v in longest:
+ sub_hg.remove_node(v)
+
+ for s, t in h_paths:
+ longest = h_paths[ (s, t) ]
+ #print "number of node in path", s,t,len(longest)
+ seq = []
+ for v, w in zip(longest[:-1], longest[1:]):
+ sg[v][w]["h_edge"] = 1
+ if sg.edge[v][w]["src"] == "OP":
+ edge_data = p_asm_G.sg_edges[ (v,w) ]
+ else:
+ edge_data = h_asm_G.sg_edges[ (v,w) ]
+ vrid = v.split(":")[0]
+ wrid = w.split(":")[0]
+ vphase = arid_to_phase.get(vrid, (-1,0))
+ wphase = arid_to_phase.get(wrid, (-1,0))
+ print >>f, "%s_%03d" % (ctg_id, h_tig_id), v, w, sg[v][w]["cross_phase"], sg[v][w]["src"], vphase[0], vphase[1], wphase[0], wphase[1]
+
+ if sg.edge[v][w]["src"] == "OP":
+ edge_data = p_asm_G.sg_edges[ (v,w) ]
+ else:
+ edge_data = h_asm_G.sg_edges[ (v,w) ]
+
+ seq_id, sp, tp = edge_data[0]
+ if sp < tp:
+ seq.append(seqs[ seq_id ][ sp:tp ])
+ else:
+ seq.append("".join([ RCMAP[c] for c in seqs[ seq_id ][ sp:tp:-1 ] ]))
+ print >> h_tig_path, "%s_%03d" % (ctg_id, h_tig_id), v, w, seq_id, sp, tp, edge_data[1], edge_data[2], "%d %d" % arid_to_phase.get(seq_id, (-1,0))
+ sg[v][w]["tig_id"] = "%s_%03d" % (ctg_id, h_tig_id)
+
+ rv, rw = reverse_end(v), reverse_end(w)
+ edges_to_remove.add( (v, w) )
+ edges_to_remove.add( (rw, rv) )
+
+ print >> h_tig_fa, ">%s_%03d" % (ctg_id, h_tig_id)
+ print >> h_tig_fa, "".join(seq)
+ h_tig_id += 1
+
+
+ h_tig_fa.close()
+ h_tig_path.close()
+
+ dump_graph = False # the code segment below is useful for showing the graph
+ if dump_graph == True:
+ for v, w in sg.edges():
+ if "h_edge" not in sg[v][w]:
+ sg[v][w]["h_edge"] = 0
+ if v in reachable_all:
+ sg.node[v]["reachable"] = 1
+ else:
+ sg.node[v]["reachable"] = 0
+ if w in reachable_all:
+ sg.node[w]["reachable"] = 1
+ else:
+ sg.node[w]["reachable"] = 0
+
+ nx.write_gexf(sg, "%s_0.gexf" % ctg_id)
+
+
+
+def parse_args(argv):
+
+ parser = argparse.ArgumentParser(description='lay out haplotigs from the primary assembly graph and the phased assembly graph')
+
+ parser.add_argument('--fc_asm_path', type=str, help='path to the primary Falcon assembly output directory', required=True)
+ parser.add_argument('--fc_hasm_path', type=str, help='path to the phased Falcon assembly output directory', required=True)
+ parser.add_argument('--ctg_id', type=str, help='contig identifier in the bam file', default = "all", required=True)
+ parser.add_argument('--base_dir', type=str, default="./", help='the output base_dir; defaults to the current working directory')
+ parser.add_argument('--rid_phase_map', type=str, help="path to the file that encodes the relationship of the read ids to phase blocks", required=True)
+ parser.add_argument('--fasta', type=str, help="sequence file of the p-reads", required=True)
+ args = parser.parse_args(argv[1:])
+
+ return args
+
+def main(argv=sys.argv):
+
+ # make life easier for now. will refactor it out if possible
+ global all_rid_to_phase
+ global p_asm_G
+ global h_asm_G
+ global all_rid_to_phase
+ global seqs
+
+ args = parse_args(argv)
+ fc_asm_path = args.fc_asm_path
+ fc_hasm_path = args.fc_hasm_path
+ ctg_id = args.ctg_id
+ base_dir = args.base_dir
+ fasta_fn = args.fasta
+
+ p_asm_G = AsmGraph(os.path.join(fc_asm_path, "sg_edges_list"),
+ os.path.join(fc_asm_path, "utg_data"),
+ os.path.join(fc_asm_path, "ctg_paths") )
+
+ h_asm_G = AsmGraph( os.path.join(fc_hasm_path, "sg_edges_list"),
+ os.path.join(fc_hasm_path, "utg_data"),
+ os.path.join(fc_hasm_path, "ctg_paths") )
+
+
+ all_rid_to_phase = {}
+
+ all_read_ids = set()
+ with open(args.rid_phase_map) as f:
+ for row in f:
+ row = row.strip().split()
+ all_rid_to_phase.setdefault( row[1], {} )
+ all_rid_to_phase[row[1]][row[0]] = (int(row[2]), int(row[3]))
+ all_read_ids.add(row[0])
+
+ for v, w in p_asm_G.sg_edges:
+ if p_asm_G.sg_edges[ (v, w) ][-1] != "G":
+ continue
+ v = v.split(":")[0]
+ w = w.split(":")[0]
+ all_read_ids.add(v)
+ all_read_ids.add(w)
+
+ for v, w in h_asm_G.sg_edges:
+ if h_asm_G.sg_edges[ (v, w) ][-1] != "G":
+ continue
+ v = v.split(":")[0]
+ w = w.split(":")[0]
+ all_read_ids.add(v)
+ all_read_ids.add(w)
+
+ seqs = load_sg_seq(all_read_ids, fasta_fn)
+
+ if ctg_id == "all":
+ ctg_id_list = p_asm_G.ctg_data.keys()
+ else:
+ ctg_id_list = [ctg_id]
+
+ exe_list = []
+ for ctg_id in ctg_id_list:
+ if ctg_id[-1] != "F":
+ continue
+ if ctg_id not in all_rid_to_phase:
+ continue
+ exe_list.append( (ctg_id, os.path.join(".", ctg_id)) )
+
+ exec_pool = Pool(4) #TODO, make this configurable
+ exec_pool.map( generate_haplotigs_for_ctg, exe_list)
+ #map( generate_haplotigs_for_ctg, exe_list)
diff --git a/FALCON_unzip/falcon_unzip/ovlp_filter_with_phase.py b/FALCON_unzip/falcon_unzip/ovlp_filter_with_phase.py
new file mode 100644
index 0000000..ab92956
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/ovlp_filter_with_phase.py
@@ -0,0 +1,354 @@
+#!/usr/bin/env python
+
+#################################################################################$$
+# Copyright (c) 2011-2014, Pacific Biosciences of California, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted (subject to the limitations in the
+# disclaimer below) provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following
+# disclaimer in the documentation and/or other materials provided
+# with the distribution.
+#
+# * Neither the name of Pacific Biosciences nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
+# GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
+# BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#################################################################################$$
+
+from multiprocessing import Pool
+import subprocess as sp
+import shlex
+import argparse
+import re
+import sys
+
+arid2phase = {}
+
+def filter_stage1(input_):
+ db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len = input_
+ try:
+ ignore_rtn = []
+ current_q_id = None
+ ave_idt = 0.0
+ all_over_len = 0.0
+ overlap_data = {"5p":0, "3p":0}
+ overlap_phase = {"5p":set(), "3p":set()}
+ q_id = None
+
+ for l in sp.check_output(shlex.split("LA4Falcon -mo %s %s" % (db_fn, fn) ) ).splitlines():
+ l = l.strip().split()
+ q_id, t_id = l[:2]
+
+ if q_id not in arid2phase:
+ continue
+ if t_id not in arid2phase:
+ continue
+ if arid2phase[t_id][0] != arid2phase[q_id][0]:
+ continue
+
+ if arid2phase[t_id][1] == arid2phase[q_id][1] and arid2phase[t_id][2] != arid2phase[q_id][2]:
+ continue
+
+ if q_id != None and q_id != current_q_id:
+
+ left_count = overlap_data["5p"]
+ right_count = overlap_data["3p"]
+
+ if abs(left_count - right_count) > max_diff:
+ ignore_rtn.append( current_q_id )
+ elif left_count > max_ovlp or right_count > max_ovlp:
+ ignore_rtn.append( current_q_id )
+ elif left_count < min_ovlp or right_count < min_ovlp:
+ ignore_rtn.append( current_q_id )
+
+ #remove unphased reads if they are sandwiched by the reads from the same phase
+ if current_q_id not in arid2phase:
+ left_set = overlap_phase["5p"]
+ right_set = overlap_phase["3p"]
+ if len( left_set & right_set ) > 0:
+ ignore_rtn.append( current_q_id )
+
+ overlap_data = {"5p":0, "3p":0}
+ overlap_phase = {"5p":set(), "3p":set()}
+ current_q_id = q_id
+ ave_idt = 0.0
+ all_over_len = 0.0
+
+ overlap_len = -int(l[2])
+ idt = float(l[3])
+ q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+ t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+
+ if idt < 90:
+ continue
+
+ if q_l < min_len or t_l < min_len:
+ continue
+
+ if l[-1] in ("contains", "overlap"):
+ ave_idt += idt * overlap_len
+ all_over_len += overlap_len
+ if q_s == 0:
+ overlap_data["5p"] += 1
+ if t_id in arid2phase:
+ overlap_phase["5p"].add( arid2phase[t_id] )
+ if q_e == q_l:
+ overlap_data["3p"] += 1
+ if t_id in arid2phase:
+ overlap_phase["3p"].add( arid2phase[t_id] )
+
+ if q_id != None:
+ left_count = overlap_data["5p"]
+ right_count = overlap_data["3p"]
+ if abs(left_count - right_count) > max_diff:
+ ignore_rtn.append( current_q_id )
+ elif left_count > max_ovlp or right_count > max_ovlp:
+ ignore_rtn.append( current_q_id )
+ elif left_count < min_ovlp or right_count < min_ovlp:
+ ignore_rtn.append( current_q_id )
+
+ #remove unphased reads if they are sandwiched by the reads from the same phase
+ if current_q_id not in arid2phase:
+ left_set = overlap_phase["5p"]
+ right_set = overlap_phase["3p"]
+ if len( left_set & right_set ) > 0:
+ ignore_rtn.append( current_q_id )
+
+
+ return fn, ignore_rtn
+
+ except (KeyboardInterrupt, SystemExit):
+ return
+
+def filter_stage2(input_):
+ db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len, ignore_set = input_
+ try:
+ contained_id = set()
+ for l in sp.check_output(shlex.split("LA4Falcon -mo %s %s" % (db_fn, fn))).splitlines():
+ l = l.strip().split()
+ q_id, t_id = l[:2]
+
+
+ if q_id not in arid2phase:
+ continue
+ if t_id not in arid2phase:
+ continue
+
+ if arid2phase[t_id][0] != arid2phase[q_id][0]:
+ continue
+ if arid2phase[t_id][1] == arid2phase[q_id][1] and arid2phase[t_id][2] != arid2phase[q_id][2]:
+ continue
+
+
+ q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+ t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+
+ idt = float(l[3])
+ if idt < 90:
+ continue
+
+ if q_l < min_len or t_l < min_len:
+ continue
+
+ if q_id in ignore_set:
+ continue
+ if t_id in ignore_set:
+ continue
+ if l[-1] == "contained":
+ contained_id.add(q_id)
+ if l[-1] == "contains":
+ contained_id.add(t_id)
+ return fn, contained_id
+
+ except (KeyboardInterrupt, SystemExit):
+ return
+
+def filter_stage3(input_):
+
+ db_fn, fn, max_diff, max_ovlp, min_ovlp, min_len, ignore_set, contained_set, bestn = input_
+
+ overlap_data = {"5p":[], "3p":[]}
+ try:
+ ovlp_output = []
+ current_q_id = None
+ for l in sp.check_output(shlex.split("LA4Falcon -mo %s %s" % (db_fn, fn) )).splitlines():
+ l = l.strip().split()
+ q_id, t_id = l[:2]
+
+
+ if q_id not in arid2phase:
+ continue
+ if t_id not in arid2phase:
+ continue
+
+ if arid2phase[t_id][0] != arid2phase[q_id][0]:
+ continue
+ if arid2phase[t_id][1] == arid2phase[q_id][1] and arid2phase[t_id][2] != arid2phase[q_id][2]:
+ continue
+
+ if current_q_id == None:
+ current_q_id = q_id
+ overlap_data = {"5p":[], "3p":[]}
+
+ elif q_id != current_q_id:
+ left = overlap_data["5p"]
+ right = overlap_data["3p"]
+ left.sort()
+ right.sort()
+
+ for i in xrange(len(left)):
+ inphase, score, m_range, ovlp = left[i]
+ ovlp_output.append(ovlp)
+ #print " ".join(ovlp), read_end_data[current_q_id]
+ if i >= bestn and m_range > 1000:
+ break
+
+ for i in xrange(len(right)):
+ inphase, score, m_range, ovlp = right[i]
+ ovlp_output.append(ovlp)
+ #print " ".join(ovlp), read_end_data[current_q_id]
+ if i >= bestn and m_range > 1000:
+ break
+
+ overlap_data = {"5p":[], "3p":[]}
+ current_q_id = q_id
+
+ if q_id in contained_set:
+ continue
+ if t_id in contained_set:
+ continue
+ if q_id in ignore_set:
+ continue
+ if t_id in ignore_set:
+ continue
+
+ overlap_len = -int(l[2])
+ idt = float(l[3])
+ q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+ t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+
+ if idt < 90:
+ continue
+ if q_l < min_len or t_l < min_len:
+ continue
+ if q_s == 0:
+ l.extend( [".".join(arid2phase.get(current_q_id, "NA")), ".".join(arid2phase.get(t_id, "NA"))])
+ inphase = 1 if arid2phase.get(current_q_id, "NA") == arid2phase.get(t_id, "NA") else 0
+ #nphase = 1 if arid2phase[current_q_id] == arid2phase[t_id] else 0
+ overlap_data["5p"].append( (-inphase, -overlap_len, t_l - (t_e - t_s), l ) )
+ elif q_e == q_l:
+ l.extend( [".".join(arid2phase.get(current_q_id, "NA")), ".".join(arid2phase.get(t_id, "NA"))])
+ inphase = 1 if arid2phase.get(current_q_id, "NA") == arid2phase.get(t_id, "NA") else 0
+ #inphase = 1 if arid2phase[current_q_id] == arid2phase[t_id] else 0
+ overlap_data["3p"].append( (-inphase, -overlap_len, t_l - (t_e - t_s), l ) )
+
+ left = overlap_data["5p"]
+ right = overlap_data["3p"]
+ left.sort()
+ right.sort()
+
+
+ for i in xrange(len(left)):
+ inphase, score, m_range, ovlp = left[i]
+ ovlp_output.append(ovlp)
+ #print " ".join(ovlp), read_end_data[current_q_id]
+ if i >= bestn and m_range > 1000:
+ break
+
+ for i in xrange(len(right)):
+ inphase, score, m_range, ovlp = right[i]
+ ovlp_output.append(ovlp)
+ #print " ".join(ovlp), read_end_data[current_q_id]
+ if i >= bestn and m_range > 1000:
+ break
+
+ return fn, ovlp_output
+ except (KeyboardInterrupt, SystemExit):
+ return
+
+
+def parse_args(argv):
+
+ parser = argparse.ArgumentParser(description='a simple multi-process LAS overlap data filter')
+ parser.add_argument('--n_core', type=int, default=4,
+ help='number of processes used for the filtering stages')
+ parser.add_argument('--fofn', type=str, help='file containing the paths of all LAS files to be processed in parallel')
+ parser.add_argument('--db', type=str, help='read db file path')
+ parser.add_argument('--max_diff', type=int, help="max difference of 5' and 3' coverage")
+ parser.add_argument('--max_cov', type=int, help="max coverage of 5' or 3' coverage")
+ parser.add_argument('--min_cov', type=int, help="min coverage of 5' or 3' coverage")
+ parser.add_argument('--min_len', type=int, default=2500, help="min length of the reads")
+ parser.add_argument('--bestn', type=int, default=10, help="output at least best n overlaps on 5' or 3' ends if possible")
+ parser.add_argument('--rid_phase_map', type=str, help="the file that encodes the relationship of the read ids to phase blocks", required=True)
+ args = parser.parse_args(argv[1:])
+
+ return args
+
+def main(argv=sys.argv):
+ args = parse_args(argv)
+
+ max_diff = args.max_diff
+ max_cov = args.max_cov
+ min_cov = args.min_cov
+ min_len = args.min_len
+ bestn = args.bestn
+ db_fn = args.db
+
+ with open(args.rid_phase_map) as f:
+ for row in f:
+ row = row.strip().split()
+ arid2phase[row[0]] = (row[1], row[2], row[3]) #ctg_id, phase_blk_id, phase_id
+
+ exe_pool = Pool(args.n_core)
+
+ file_list = open(args.fofn).read().split("\n")
+ inputs = []
+ for fn in file_list:
+ if len(fn) != 0:
+ inputs.append( (db_fn, fn, max_diff, max_cov, min_cov, min_len) )
+
+ ignore_all = []
+ for res in exe_pool.imap(filter_stage1, inputs):
+ ignore_all.extend( res[1] )
+
+ inputs = []
+ ignore_all = set(ignore_all)
+ for fn in file_list:
+ if len(fn) != 0:
+ inputs.append( (db_fn, fn, max_diff, max_cov, min_cov, min_len, ignore_all) )
+ contained = set()
+ for res in exe_pool.imap(filter_stage2, inputs):
+ contained.update(res[1])
+ #print res[0], len(res[1]), len(contained)
+
+ #print "all", len(contained)
+ inputs = []
+ ignore_all = set(ignore_all)
+ for fn in file_list:
+ if len(fn) != 0:
+ inputs.append( (db_fn, fn, max_diff, max_cov, min_cov, min_len, ignore_all, contained, bestn) )
+ for res in exe_pool.imap(filter_stage3, inputs):
+ for l in res[1]:
+ print " ".join(l)
diff --git a/FALCON_unzip/falcon_unzip/phased_ovlp_to_graph.py b/FALCON_unzip/falcon_unzip/phased_ovlp_to_graph.py
new file mode 100644
index 0000000..772f011
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/phased_ovlp_to_graph.py
@@ -0,0 +1,1545 @@
+import networkx as nx
+import os
+import shlex
+import sys
+import subprocess
+import argparse
+
+DEBUG_LOG_LEVEL = 0
+
+class SGNode(object):
+ """
+ class representing a node in the string graph
+ """
+ def __init__(self, node_name):
+ self.name = node_name
+ self.out_edges = []
+ self.in_edges = []
+
+ def add_out_edge(self, out_edge):
+ self.out_edges.append(out_edge)
+
+ def del_out_edge(self, out_edge):
+ self.out_edges.remove(out_edge)
+
+ def add_in_edge(self, in_edge):
+ self.in_edges.append(in_edge)
+
+ def del_in_edge(self, in_edge):
+ self.in_edges.remove(in_edge)
+
+class SGEdge(object):
+ """
+ class representing an edge in the string graph
+ """
+ def __init__(self, in_node, out_node):
+ self.in_node = in_node
+ self.out_node = out_node
+ self.attr = {}
+ def set_attribute(self, attr, value):
+ self.attr[attr] = value
+
+def reverse_end( node_id ):
+ node_id, end = node_id.split(":")
+ new_end = "B" if end == "E" else "E"
+ return node_id + ":" + new_end
+
+class StringGraph(object):
+ """
+ class representing the string graph
+ """
+ def __init__(self):
+ self.nodes = {}
+ self.edges = {}
+ self.n_mark = {}
+ self.e_reduce = {}
+ self.repeat_overlap = {}
+
+ def add_node(self, node_name):
+ """
+ add a node into the graph by given a node name
+ """
+ if node_name not in self.nodes:
+ self.nodes[node_name] = SGNode(node_name)
+
+ def add_edge(self, in_node_name, out_node_name, **attributes):
+ """
+ add an edge into the graph by given a pair of nodes
+ """
+ if (in_node_name, out_node_name) not in self.edges:
+
+ self.add_node(in_node_name)
+ self.add_node(out_node_name)
+ in_node = self.nodes[in_node_name]
+ out_node = self.nodes[out_node_name]
+
+ edge = SGEdge(in_node, out_node)
+ self.edges[ (in_node_name, out_node_name) ] = edge
+ in_node.add_out_edge(edge)
+ out_node.add_in_edge(edge)
+ edge = self.edges[ (in_node_name, out_node_name) ]
+ for k, v in attributes.items():
+ edge.attr[k] = v
+
+ def remove_cross_phase_edges(self):
+
+ to_remove = {}
+
+ for k,e in self.edges.items():
+ if e.attr["phase"][0] != e.attr["phase"][1]:
+ to_remove[k] = e
+
+ for k, e in to_remove.items():
+ in_node_name, out_node_name = k
+ in_node = self.nodes[in_node_name]
+ out_node = self.nodes[out_node_name]
+ in_node.del_out_edge(e)
+ out_node.del_in_edge(e)
+ del self.edges[k]
+
+ return to_remove
+
+ def init_reduce_dict(self):
+ for e in self.edges:
+ self.e_reduce[e] = False
+
+ def bfs_nodes(self, n, exclude = None, depth=5):
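+        # bounded breadth-first expansion from node n: collect successor nodes
+        # (skipping `exclude`), popping at most depth-1 frontier nodes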
+ all_nodes = set()
+ all_nodes.add(n)
+ candidate_nodes = set()
+ candidate_nodes.add(n)
+ dp = 1
+ while dp < depth and len(candidate_nodes) > 0:
+ v = candidate_nodes.pop()
+ for e in v.out_edges :
+ w = e.out_node
+ if w == exclude:
+ continue
+ if w not in all_nodes:
+ all_nodes.add(w)
+ if len(w.out_edges) > 0:
+ candidate_nodes.add(w)
+ dp += 1
+
+ return all_nodes
+
+
+ def mark_chimer_edges(self):
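+        """
+        Heuristically detect chimeric reads: a candidate node is a successor of
+        some node with multiple out-edges and a predecessor of some node with
+        multiple in-edges; if its own successors share nothing with the
+        successors of its predecessors, and bounded BFS expansions of the two
+        sides stay disjoint, every edge touching the node (and the corresponding
+        reverse-complement edges) is marked reduced as a chimer edge.
+        """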
+
+ multi_in_nodes = {}
+ multi_out_nodes = {}
+ for n_name in self.nodes:
+ n = self.nodes[n_name]
+ out_nodes = [ e.out_node for e in n.out_edges if self.e_reduce[(e.in_node.name, e.out_node.name)] == False ]
+ in_nodes = [e.in_node for e in n.in_edges if self.e_reduce[(e.in_node.name, e.out_node.name)] == False]
+
+ if len(out_nodes) >= 2:
+ multi_out_nodes[n_name] = out_nodes
+ if len(in_nodes) >= 2:
+ multi_in_nodes[n_name] = in_nodes
+
+ chimer_candidates = set()
+ out_set = set()
+ in_set = set()
+ for n_name in multi_out_nodes:
+ out_nodes = set(multi_out_nodes[n_name])
+ out_set |= out_nodes
+
+ for n_name in multi_in_nodes:
+ in_nodes = set(multi_in_nodes[n_name])
+ in_set |= in_nodes
+
+ chimer_candidates = out_set & in_set
+
+ chimer_nodes = []
+ chimer_edges = set()
+ for n in chimer_candidates:
+ out_nodes = set( [ e.out_node for e in n.out_edges ] )
+ test_set = set()
+ for in_node in [e.in_node for e in n.in_edges ]:
+ test_set = test_set | set( [ e.out_node for e in in_node.out_edges ] )
+ test_set -= set([n])
+ if len( out_nodes & test_set ) == 0:
+ flow_node1 = set()
+ flow_node2 = set()
+ for v in list(out_nodes):
+ flow_node1 |= self.bfs_nodes(v, exclude=n)
+ for v in list(test_set):
+ flow_node2 |= self.bfs_nodes(v, exclude=n)
+ if len( flow_node1 & flow_node2 ) == 0:
+ for e in n.out_edges:
+ v, w = e.in_node.name, e.out_node.name
+ if self.e_reduce[ (v, w) ] != True:
+ self.e_reduce[ (v, w) ] = True
+ chimer_edges.add( (v, w) )
+ rv = reverse_end(w)
+ rw = reverse_end(v)
+ self.e_reduce[ (rv, rw) ] = True
+ chimer_edges.add( (rv, rw) )
+
+ for e in n.in_edges:
+ v, w = e.in_node.name, e.out_node.name
+ if self.e_reduce[ (v, w) ] != True:
+ self.e_reduce[ (v, w) ] = True
+ chimer_edges.add( (v, w) )
+ rv = reverse_end(w)
+ rw = reverse_end(v)
+ self.e_reduce[ (rv, rw) ] = True
+ chimer_edges.add( (rv, rw) )
+ chimer_nodes.append( n.name )
+ chimer_nodes.append( reverse_end(n.name) )
+
+ return chimer_nodes, chimer_edges
+
+ def mark_spur_edge(self):
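+        """
+        Remove dead-end branches: for a node with multiple out-edges (in-edges),
+        reduce any edge leading to (coming from) a node that has no further
+        out-edges (in-edges), together with its reverse-complement edge.
+        """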
+
+ removed_edges = set()
+ for v in self.nodes:
+ if len(self.nodes[v].out_edges) > 1:
+ for out_edge in self.nodes[v].out_edges:
+ w = out_edge.out_node.name
+
+ if len(self.nodes[w].out_edges) == 0 and self.e_reduce[ (v, w) ] != True:
+ self.e_reduce[(v, w)] = True
+ removed_edges.add( (v, w) )
+ v2, w2 = reverse_end(w), reverse_end(v)
+ self.e_reduce[(v2, w2)] = True
+ removed_edges.add( (v2, w2) )
+
+ if len(self.nodes[v].in_edges) > 1:
+ for in_edge in self.nodes[v].in_edges:
+ w = in_edge.in_node.name
+ if len(self.nodes[w].in_edges) == 0 and self.e_reduce[ (w, v) ] != True:
+ self.e_reduce[(w, v)] = True
+ removed_edges.add( (w, v) )
+ v2, w2 = reverse_end(w), reverse_end(v)
+ self.e_reduce[(w2, v2)] = True
+ removed_edges.add( (w2, v2) )
+ return removed_edges
+
+ def mark_tr_edges(self):
+ """
+ transitive reduction
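+
+        For each node, an out-edge is marked reduced when its target can also be
+        reached through a two-edge detour whose combined length stays within the
+        longest out-edge length plus a FUZZ tolerance of 500 bases; the
+        corresponding reverse-complement edge is reduced as well.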
+ """
+ n_mark = self.n_mark
+ e_reduce = self.e_reduce
+ FUZZ = 500
+ for n in self.nodes:
+ n_mark[n] = "vacant"
+
+ for n_name, node in self.nodes.items():
+
+ out_edges = node.out_edges
+ if len(out_edges) == 0:
+ continue
+
+ out_edges.sort(key=lambda x: x.attr["length"])
+
+ for e in out_edges:
+ w = e.out_node
+ n_mark[ w.name ] = "inplay"
+
+ max_len = out_edges[-1].attr["length"]
+
+ max_len += FUZZ
+
+ for e in out_edges:
+ e_len = e.attr["length"]
+ w = e.out_node
+ if n_mark[w.name] == "inplay":
+ w.out_edges.sort( key=lambda x: x.attr["length"] )
+ for e2 in w.out_edges:
+ if e2.attr["length"] + e_len < max_len:
+ x = e2.out_node
+ if n_mark[x.name] == "inplay":
+ n_mark[x.name] = "eliminated"
+
+ for e in out_edges:
+ e_len = e.attr["length"]
+ w = e.out_node
+ w.out_edges.sort( key=lambda x: x.attr["length"] )
+ if len(w.out_edges) > 0:
+ x = w.out_edges[0].out_node
+ if n_mark[x.name] == "inplay":
+ n_mark[x.name] = "eliminated"
+ for e2 in w.out_edges:
+ if e2.attr["length"] < FUZZ:
+ x = e2.out_node
+ if n_mark[x.name] == "inplay":
+ n_mark[x.name] = "eliminated"
+
+ for out_edge in out_edges:
+ v = out_edge.in_node
+ w = out_edge.out_node
+ if n_mark[w.name] == "eliminated":
+ e_reduce[ (v.name, w.name) ] = True
+ v_name, w_name = reverse_end(w.name), reverse_end(v.name)
+ e_reduce[(v_name, w_name)] = True
+ n_mark[w.name] = "vacant"
+
+
+ def mark_best_overlap(self):
+ """
+        keep the best-scoring out-edge and in-edge for each node; mark all other active edges for removal
+ """
+
+ best_edges = set()
+ removed_edges = set()
+
+ for v in self.nodes:
+
+ out_edges = self.nodes[v].out_edges
+ if len(out_edges) > 0:
+ out_edges.sort(key=lambda e: -e.attr["score"])
+ for e in out_edges:
+ if self.e_reduce[ (e.in_node.name, e.out_node.name) ] != True:
+ best_edges.add( (e.in_node.name, e.out_node.name) )
+ break
+
+ in_edges = self.nodes[v].in_edges
+ if len(in_edges) > 0:
+ in_edges.sort(key=lambda e: -e.attr["score"])
+ for e in in_edges:
+ if self.e_reduce[ (e.in_node.name, e.out_node.name) ] != True:
+ best_edges.add( (e.in_node.name, e.out_node.name) )
+ break
+
+ if DEBUG_LOG_LEVEL > 1:
+ print "X", len(best_edges)
+
+ for e_n, e in self.edges.items():
+ v = e_n[0]
+ w = e_n[1]
+ if self.e_reduce[ (v, w) ] != True:
+ if (v, w) not in best_edges:
+ self.e_reduce[(v, w)] = True
+ removed_edges.add( (v, w) )
+ v2, w2 = reverse_end(w), reverse_end(v)
+ self.e_reduce[(v2, w2)] = True
+ removed_edges.add( (v2, w2) )
+
+ return removed_edges
+
+ def resolve_repeat_edges(self):
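+        """
+        Local flow constraint repeat resolution (used with --lfc): for each node
+        with exactly one active in-edge and one active out-edge, examine the
+        alternative branches leaving its predecessor (and entering its successor)
+        and reduce branches that share no downstream (upstream) flow with the
+        node itself while still having other support.
+        """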
+
+
+ edges_to_reduce = []
+ nodes_to_test = set()
+ for v_n, v in self.nodes.items():
+
+ out_nodes = []
+ for e in v.out_edges:
+ if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
+ out_nodes.append( e.out_node.name )
+
+ in_nodes = []
+ for e in v.in_edges:
+ if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
+ in_nodes.append( e.in_node.name )
+
+ if len(out_nodes) == 1 and len(in_nodes) == 1:
+ nodes_to_test.add(v_n)
+
+ for v_n in list( nodes_to_test ):
+
+ v = self.nodes[v_n]
+
+ out_nodes = []
+ for e in v.out_edges:
+ if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
+ out_nodes.append( e.out_node.name )
+
+ in_nodes = []
+ for e in v.in_edges:
+ if self.e_reduce[(e.in_node.name, e.out_node.name)] == False:
+ in_nodes.append( e.in_node.name )
+
+ in_node_name = in_nodes[0]
+
+ for out_edge in self.nodes[in_node_name].out_edges:
+ vv = out_edge.in_node.name
+ ww = out_edge.out_node.name
+
+ ww_out = self.nodes[ww].out_edges
+ v_out = self.nodes[v_n].out_edges
+ ww_out_nodes = set( [ n.out_node.name for n in ww_out] )
+ v_out_nodes = set( [ n.out_node.name for n in v_out] )
+ o_overlap = len( ww_out_nodes & v_out_nodes )
+
+ ww_in_count = 0
+ for e in self.nodes[ww].in_edges:
+ if self.e_reduce[ ( e.in_node.name, e.out_node.name ) ] == False:
+ ww_in_count += 1
+
+ if ww != v_n and\
+ self.e_reduce[ (vv, ww) ] == False and\
+ ww_in_count > 1 and\
+ ww not in nodes_to_test and\
+ o_overlap == 0:
+ edges_to_reduce.append( (vv, ww) )
+
+ out_node_name = out_nodes[0]
+
+ for in_edge in self.nodes[out_node_name].in_edges:
+ vv = in_edge.in_node.name
+ ww = in_edge.out_node.name
+
+ vv_in = self.nodes[vv].in_edges
+ v_in = self.nodes[v_n].in_edges
+ vv_in_nodes = set( [ n.in_node.name for n in vv_in] )
+ v_in_nodes = set( [ n.in_node.name for n in v_in] )
+ i_overlap = len( vv_in_nodes & v_in_nodes )
+
+ vv_out_count = 0
+ for e in self.nodes[vv].out_edges:
+ if self.e_reduce[ ( e.in_node.name, e.out_node.name )] == False:
+ vv_out_count += 1
+
+ if vv != v_n and\
+ self.e_reduce[ (vv, ww) ] == False and\
+ vv_out_count > 1 and\
+ vv not in nodes_to_test and\
+ i_overlap == 0:
+ edges_to_reduce.append( (vv, ww) )
+
+ removed_edges = set()
+ for e in edges_to_reduce:
+ self.e_reduce[e] = True
+ removed_edges.add(e)
+
+ return removed_edges
+
+ def get_out_edges_for_node(self, name, mask=True):
+ rtn = []
+ for e in self.nodes[name].out_edges:
+ v = e.in_node
+ w = e.out_node
+ if self.e_reduce[ (v.name, w.name) ] == False:
+ rtn.append(e)
+ return rtn
+
+
+ def get_in_edges_for_node(self, name, mask=True):
+ rtn = []
+ for e in self.nodes[name].in_edges:
+ v = e.in_node
+ w = e.out_node
+ if self.e_reduce[ (v.name, w.name) ] == False:
+ rtn.append(e)
+ return rtn
+
+ def get_best_out_edge_for_node(self, name, mask=True):
+ rtn = []
+ for e in self.nodes[name].out_edges:
+ v = e.in_node
+ w = e.out_node
+ if self.e_reduce[ (v.name, w.name) ] == False:
+ rtn.append(e)
+ rtn.sort(key=lambda e: e.attr["score"])
+
+ return rtn[-1]
+
+ def get_best_in_edge_for_node(self, name, mask=True):
+ rtn = []
+ for e in self.nodes[name].in_edges:
+ v = e.in_node
+ w = e.out_node
+ if self.e_reduce[ (v.name, w.name) ] == False:
+ rtn.append(e)
+ rtn.sort(key=lambda e: e.attr["score"])
+ return rtn[-1]
+
+
+RCMAP = dict(zip("ACGTacgtNn-","TGCAtgcaNn-"))
+def generate_seq_from_path(sg, seqs, path):
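+    """
+    Concatenate the read segments recorded on the edge labels along `path`,
+    reverse-complementing a segment whenever its begin coordinate is greater
+    than its end coordinate.
+    """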
+ subseqs = []
+ r_id, end = path[0].split(":")
+
+ count = 0
+ for i in range( len( path ) -1 ):
+ w_n, v_n = path[i:i+2]
+ edge = sg.edges[ (w_n, v_n ) ]
+ read_id, coor = edge.attr["label"].split(":")
+ b,e = coor.split("-")
+ b = int(b)
+ e = int(e)
+ if b < e:
+ subseqs.append( seqs[read_id][b:e] )
+ else:
+ subseqs.append( "".join( [RCMAP[c] for c in seqs[read_id][b:e:-1]] ) )
+
+ return "".join(subseqs)
+
+
+def reverse_edge( e ):
+ e1, e2 = e
+ return reverse_end(e2), reverse_end(e1)
+
+def reverse_path( p ):
+ p = p[::-1]
+ return [reverse_end(n) for n in p]
+
+
+def find_bundle(ug, u_edge_data, start_node, depth_cutoff, width_cutoff, length_cutoff):
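+    """
+    Starting from start_node, greedily grow a set of unitig edges (a "bundle")
+    until the tip set collapses to a single end node, giving up when the number
+    of tips, the depth, the width, or the accumulated length exceeds the given
+    cutoffs, or when a loop is detected.  Returns a convergence flag plus the
+    bundle data (start, end, edges, length, score, depth).
+    """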
+
+ tips = set()
+ bundle_edges = set()
+ bundle_nodes = set()
+
+ local_graph = nx.ego_graph(ug, start_node, depth_cutoff, undirected=False)
+ length_to_node = {start_node:0}
+ score_to_node = {start_node:0}
+
+ v = start_node
+ end_node = start_node
+
+ if DEBUG_LOG_LEVEL > 1:
+ print
+ print
+ print "start", start_node
+
+ bundle_nodes.add(v)
+ for vv, ww, kk in local_graph.out_edges(v, keys = True):
+ max_score = 0
+ max_length = 0
+
+ if (vv, ww, kk) not in bundle_edges and\
+ reverse_end(ww) not in bundle_nodes:
+
+ bundle_edges.add( (vv, ww, kk) )
+ tips.add(ww)
+
+ for v in list(tips):
+ bundle_nodes.add(v)
+
+ depth = 1
+ width = 1.0
+ converage = False
+
+
+ while 1:
+ if DEBUG_LOG_LEVEL > 1:
+ print "# of tips", len(tips)
+
+ if len(tips) > 4:
+ converage = False
+ break
+
+ if len(tips) == 1:
+ end_node = tips.pop()
+
+ if DEBUG_LOG_LEVEL > 1:
+ print "end", end_node
+
+ if end_node not in length_to_node:
+ v = end_node
+ max_score_edge = None
+ max_score = 0
+ for uu, vv, kk in local_graph.in_edges(v, keys=True):
+ if uu not in length_to_node:
+ continue
+
+ score = u_edge_data[ (uu, vv, kk) ][1]
+
+ if score > max_score:
+
+ max_score = score
+ max_score_edge = (uu, vv, kk)
+
+ length_to_node[v] = length_to_node[max_score_edge[0]] + u_edge_data[ max_score_edge ][0]
+ score_to_node[v] = score_to_node[max_score_edge[0]] + u_edge_data[ max_score_edge ][1]
+
+
+ converage = True
+ break
+
+
+ depth += 1
+ width = 1.0 * len(bundle_edges) / depth
+
+ if depth > 10 and width > width_cutoff:
+ converage = False
+ break
+
+ if depth > depth_cutoff:
+ converage = False
+ break
+
+ tips_list = list(tips)
+
+ tip_updated = False
+ loop_detect = False
+ length_limit_reached = False
+
+ for v in tips_list:
+ if DEBUG_LOG_LEVEL > 1:
+ print "process", v
+
+ if len(local_graph.out_edges(v, keys=True)) == 0: # dead end route
+ print "no out edge", v
+ continue
+
+ max_score_edge = None
+ max_score = 0
+
+ extend_tip = True
+
+ for uu, vv, kk in local_graph.in_edges(v, keys=True):
+ if DEBUG_LOG_LEVEL > 1:
+ print "in_edges", uu, vv, kk
+ print uu, "in length_to_node", uu in length_to_node
+
+ if uu not in length_to_node:
+ extend_tip = False
+ break
+
+ score = u_edge_data[ (uu, vv, kk) ][1]
+
+ if score > max_score:
+
+ max_score = score
+ max_score_edge = (uu, vv, kk)
+
+ if extend_tip:
+
+ length_to_node[v] = length_to_node[max_score_edge[0]] + u_edge_data[ max_score_edge ][0]
+ score_to_node[v] = score_to_node[max_score_edge[0]] + u_edge_data[ max_score_edge ][1]
+
+ if length_to_node[v] > length_cutoff:
+ length_limit_reached = True
+ converage = False
+ break
+
+ v_updated = False
+ for vv, ww, kk in local_graph.out_edges(v, keys=True):
+
+ if DEBUG_LOG_LEVEL > 1:
+ print "test", vv, ww, kk
+
+ if ww in length_to_node:
+ loop_detect = True
+ if DEBUG_LOG_LEVEL > 1:
+ print "loop_detect", ww
+ break
+
+ if (vv, ww, kk) not in bundle_edges and\
+ reverse_end(ww) not in bundle_nodes:
+
+ if DEBUG_LOG_LEVEL > 1:
+ print "add", ww
+
+ tips.add(ww)
+ bundle_edges.add( (vv, ww, kk) )
+ tip_updated = True
+ v_updated = True
+
+ if v_updated:
+
+ if DEBUG_LOG_LEVEL > 1:
+ print "remove", v
+
+ tips.remove(v)
+
+ if len(tips) == 1:
+ break
+
+ if loop_detect:
+ converage = False
+ break
+
+ if length_limit_reached:
+ converage = False
+ break
+
+ if loop_detect:
+ converage = False
+ break
+
+ if not tip_updated:
+ converage = False
+ break
+
+ for v in list(tips):
+ bundle_nodes.add(v)
+
+
+
+ data = start_node, end_node, bundle_edges, length_to_node[end_node], score_to_node[end_node], depth
+
+ data_r = None
+
+ if DEBUG_LOG_LEVEL > 1:
+ print converage, data, data_r
+ return converage, data, data_r
+
+def generate_string_graph(args):
+
+ overlap_file = args.overlap_file
+
+ contained_reads = set()
+ chimer_ids = set()
+
+ filter_reads = False
+
+ seqs = set()
+
+ G=nx.Graph()
+ edges =set()
+ overlap_data = []
+ contained_reads = set()
+ overlap_count = {}
+
+
+    # loop through the overlap data, loading the records into a python list
+    # and identifying contained reads along the way
+
+ with open(overlap_file) as f:
+ for l in f:
+ l = l.strip().split()
+
+
+            #work around for some ill-formed data records
+ #if len(l) != 13:
+ # continue
+
+ f_id, g_id, score, identity = l[:4]
+
+ if f_id == g_id: # don't need self-self overlapping
+ continue
+
+ if filter_reads:
+
+ if g_id not in seqs:
+ continue
+
+ if f_id not in seqs:
+ continue
+
+ phase1 = l[13]
+ phase2 = l[14]
+
+ score = int(score)
+ identity = float(identity)
+ contained = l[12]
+ if contained == "contained" and phase1 == phase2:
+ contained_reads.add(f_id)
+ continue
+ if contained == "contains" and phase1 == phase2:
+ contained_reads.add(g_id)
+ continue
+ if contained == "none":
+ continue
+
+            if identity < args.min_idt: # only keep records whose identity is at least min_idt (default 96%)
+ continue
+ f_strain, f_start, f_end, f_len = (int(c) for c in l[4:8])
+ g_strain, g_start, g_end, g_len = (int(c) for c in l[8:12])
+
+            # only use reads longer than min_len (default 4 kb) for assembly
+ if f_len < args.min_len: continue
+ if g_len < args.min_len: continue
+
+ """
+ # double check for proper overlap
+ # this is not necessary when using DALIGNER for overlapper
+ # it may be useful if other overlappers give fuzzier alignment boundary
+ if f_start > 24 and f_len - f_end > 24: # allow 24 base tolerance on both sides of the overlapping
+ continue
+
+ if g_start > 24 and g_len - g_end > 24:
+ continue
+
+ if g_strain == 0:
+ if f_start < 24 and g_len - g_end > 24:
+ continue
+ if g_start < 24 and f_len - f_end > 24:
+ continue
+ else:
+ if f_start < 24 and g_start > 24:
+ continue
+ if g_start < 24 and f_start > 24:
+ continue
+ """
+
+ overlap_data.append( (f_id, g_id, score, identity,
+ f_strain, f_start, f_end, f_len,
+ g_strain, g_start, g_end, g_len,
+ phase1, phase2) )
+
+ overlap_count[f_id] = overlap_count.get(f_id,0)+1
+ overlap_count[g_id] = overlap_count.get(g_id,0)+1
+
+ overlap_set = set()
+ sg = StringGraph()
+ for od in overlap_data:
+ f_id, g_id, score, identity = od[:4]
+ if f_id in contained_reads:
+ continue
+ if g_id in contained_reads:
+ continue
+ f_s, f_b, f_e, f_l = od[4:8]
+ g_s, g_b, g_e, g_l = od[8:12]
+
+ phase1 = od[12]
+ phase2 = od[13]
+
+ overlap_pair = [f_id, g_id]
+ overlap_pair.sort()
+ overlap_pair = tuple( overlap_pair )
+ if overlap_pair in overlap_set: # don't allow duplicated records
+ continue
+ else:
+ overlap_set.add(overlap_pair)
+
+
+        if g_s == 1: # reversed alignment, swap the begin and end coordinates
+ g_b, g_e = g_e, g_b
+
+ # build the string graph edges for each overlap
+ if f_b > 1:
+ if g_b < g_e:
+ """
+ f.B f.E
+ f ----------->
+ g ------------->
+ g.B g.E
+ """
+ if f_b == 0 or g_e - g_l == 0:
+ continue
+ sg.add_edge( "%s:B" % g_id, "%s:B" % f_id, label = (f_id, f_b, 0),
+ length = abs(f_b-0),
+ score = -score,
+ identity = identity,
+ phase = (phase2, phase1))
+ sg.add_edge( "%s:E" % f_id, "%s:E" % g_id, label = (g_id, g_e, g_l),
+ length = abs(g_e-g_l),
+ score = -score,
+ identity = identity,
+ phase = (phase1, phase2))
+ else:
+ """
+ f.B f.E
+ f ----------->
+ g <-------------
+ g.E g.B
+ """
+ if f_b == 0 or g_e == 0:
+ continue
+ sg.add_edge( "%s:E" % g_id, "%s:B" % f_id, label = (f_id, f_b, 0),
+ length = abs(f_b -0),
+ score = -score,
+ identity = identity,
+ phase = (phase2, phase1))
+ sg.add_edge( "%s:E" % f_id, "%s:B" % g_id, label = (g_id, g_e, 0),
+ length = abs(g_e- 0),
+ score = -score,
+ identity = identity,
+ phase = (phase1, phase2))
+ else:
+ if g_b < g_e:
+ """
+ f.B f.E
+ f ----------->
+ g ------------->
+ g.B g.E
+ """
+ if g_b == 0 or f_e - f_l == 0:
+ continue
+ sg.add_edge( "%s:B" % f_id, "%s:B" % g_id, label = (g_id, g_b, 0),
+ length = abs(g_b - 0),
+ score = -score,
+ identity = identity,
+ phase = (phase1, phase2))
+ sg.add_edge( "%s:E" % g_id, "%s:E" % f_id, label = (f_id, f_e, f_l),
+ length = abs(f_e-f_l),
+ score = -score,
+ identity = identity,
+ phase = (phase2, phase1))
+ else:
+ """
+ f.B f.E
+ f ----------->
+ g <-------------
+ g.E g.B
+ """
+ if g_b - g_l == 0 or f_e - f_l ==0:
+ continue
+ sg.add_edge( "%s:B" % f_id, "%s:E" % g_id, label = (g_id, g_b, g_l),
+ length = abs(g_b - g_l),
+ score = -score,
+ identity = identity,
+ phase = (phase1, phase2))
+ sg.add_edge( "%s:B" % g_id, "%s:E" % f_id, label = (f_id, f_e, f_l),
+ length = abs(f_e - f_l),
+ score = -score,
+ identity = identity,
+ phase=(phase2, phase1))
+
+
+ cp_edges = sg.remove_cross_phase_edges()
+ sg.init_reduce_dict()
+
+ #if not args.disable_chimer_prediction:
+ # sg.mark_chimer_edges()
+ #sg.mark_spur_edge()
+
+
+    sg.mark_tr_edges() # mark transitively redundant edges
+
+ if DEBUG_LOG_LEVEL > 1:
+ print sum( [1 for c in sg.e_reduce.values() if c == True] )
+ print sum( [1 for c in sg.e_reduce.values() if c == False] )
+
+ chimer_nodes, chimer_edges = sg.mark_chimer_edges()
+
+ removed_edges = set()
+ if args.lfc == True:
+ removed_edges = sg.resolve_repeat_edges()
+ else:
+        removed_edges = sg.mark_best_overlap() # keep only the best overlap edges, mark the rest for removal
+
+ spur_edges = sg.mark_spur_edge()
+
+ if DEBUG_LOG_LEVEL > 1:
+ print sum( [1 for c in sg.e_reduce.values() if c == False] )
+
+ #max_score = max([ sg.edges[ e ].attr["score"] for e in sg.edges if sg.e_reduce[e] != True ])
+ out_f = open("sg_edges_list", "w")
+ nxsg = nx.DiGraph()
+ edge_data = {}
+ for v, w in sg.edges:
+ e = sg.edges[ (v, w) ]
+ rid, sp, tp = e.attr["label"]
+ score = e.attr["score"]
+ identity = e.attr["identity"]
+ length = abs(sp-tp)
+
+
+ if sg.e_reduce[(v, w)] != True:
+ type_ = "G"
+ label = "%s:%d-%d" % (rid, sp, tp)
+ nxsg.add_edge(v, w, label = label, length = length, score = score)
+ edge_data[ (v, w) ] = (rid, sp, tp, length, score, identity, type_)
+ elif (v, w) in chimer_edges:
+ type_ = "C"
+ elif (v, w) in removed_edges:
+ type_ = "R"
+ elif (v, w) in spur_edges:
+ type_ = "S"
+ elif sg.e_reduce[(v, w)] == True:
+ type_ = "TR"
+
+ print >>out_f, v, w, rid, sp, tp, score, identity, type_
+
+ for k,e in cp_edges.items():
+ v = k[0]
+ w = k[1]
+ rid, sp, tp = e.attr["label"]
+ score = e.attr["score"]
+ identity = e.attr["identity"]
+ length = abs(sp-tp)
+ print >>out_f, v, w, rid, sp, tp, score, identity, "CP"
+
+
+ out_f.close()
+ nxsg_r = nxsg.reverse()
+
+ return nxsg, nxsg_r, edge_data
+
+
+
+def construct_compound_paths(ug, u_edge_data):
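+    """
+    From every branching node, try to collapse the local subgraph into a
+    converging bundle using find_bundle(); keep only bundles that do not share
+    edges with previously accepted bundles, that have a matching
+    reverse-complement bundle, and that are not contained in another bundle.
+    The surviving bundles become "compound" unitig edges.
+    """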
+
+ source_nodes = set()
+ sink_nodes = set()
+ simple_nodes = set()
+ branch_nodes = set()
+
+ all_nodes = ug.nodes()
+ for n in all_nodes:
+ in_degree = len( ug.in_edges(n) )
+ out_degree = len( ug.out_edges(n) )
+ if in_degree == 0:
+ source_nodes.add(n)
+ if out_degree == 0:
+ sink_nodes.add(n)
+ if in_degree == 1 and out_degree == 1:
+ simple_nodes.add(n)
+ if in_degree > 1 or out_degree > 1:
+ branch_nodes.add(n)
+
+ #print "#", len(all_nodes),len(source_nodes), len(sink_nodes), len(simple_nodes), len(branch_nodes)
+ compound_paths_0 = []
+ for p in list(branch_nodes):
+ if ug.out_degree(p) > 1:
+ coverage, data, data_r = find_bundle(ug, u_edge_data, p, 48, 16, 500000)
+ if coverage == True:
+ start_node, end_node, bundle_edges, length, score, depth = data
+ compound_paths_0.append( (start_node, "NA", end_node, 1.0*len(bundle_edges)/depth, length, score, bundle_edges ) )
+
+ compound_paths_0.sort( key=lambda x: -len(x[6]) )
+
+
+ edge_to_cpath = {}
+ compound_paths_1 = {}
+ for s, v, t, width, length, score, bundle_edges in compound_paths_0:
+ if DEBUG_LOG_LEVEL > 1:
+ print "constructing utg, test ", s,v, t
+
+ overlapped = False
+ for vv, ww, kk in list(bundle_edges):
+ if (vv, ww, kk) in edge_to_cpath:
+ if DEBUG_LOG_LEVEL > 1:
+ print "remove overlapped utg", (s, v, t), (vv, ww, kk)
+ overlapped = True
+ break
+ rvv = reverse_end(vv)
+ rww = reverse_end(ww)
+ rkk = reverse_end(kk)
+ if (rww, rvv, rkk) in edge_to_cpath:
+ if DEBUG_LOG_LEVEL > 1:
+ print "remove overlapped r utg", (s, v, t), (rww, rvv, rkk)
+ overlapped = True
+ break
+
+
+ if not overlapped:
+ if DEBUG_LOG_LEVEL > 1:
+ print "constructing", s,v, t
+
+ bundle_edges_r = []
+ rs = reverse_end(t)
+ rt = reverse_end(s)
+
+ for vv, ww, kk in list(bundle_edges):
+ edge_to_cpath.setdefault( (vv, ww, kk), set() )
+ edge_to_cpath[ (vv, ww, kk) ].add( ( s, t, v) )
+ rvv = reverse_end(ww)
+ rww = reverse_end(vv)
+ rkk = reverse_end(kk)
+ edge_to_cpath.setdefault( (rvv, rww, rkk), set() )
+ edge_to_cpath[ (rvv, rww, rkk) ].add( (rs, rt, v) ) #assert v == "NA"
+ bundle_edges_r.append( (rvv, rww, rkk) )
+
+ compound_paths_1[ ( s, v, t) ] = width, length, score, bundle_edges
+ compound_paths_1[ ( rs, v, rt) ] = width, length, score, bundle_edges_r
+
+
+ compound_paths_2 = {}
+ edge_to_cpath = {}
+ for s, v, t in compound_paths_1:
+ rs = reverse_end(t)
+ rt = reverse_end(s)
+ if (rs, "NA", rt) not in compound_paths_1:
+ if DEBUG_LOG_LEVEL > 1:
+ print "non_compliment bundle", s, v, t, len(compound_paths_1[( s, v, t)][-1])
+ continue
+ width, length, score, bundle_edges = compound_paths_1[ (s, v, t) ]
+ compound_paths_2[ (s, v, t) ] = width, length, score, bundle_edges
+ for vv, ww, kk in list(bundle_edges):
+ edge_to_cpath.setdefault( (vv, ww, kk), set() )
+ edge_to_cpath[ (vv, ww, kk) ].add( ( s, t, v) )
+
+
+ compound_paths_3 = {}
+ for k, val in compound_paths_2.items():
+
+ start_node, NA, end_node = k
+ rs = reverse_end(end_node)
+ rt = reverse_end(start_node)
+ assert (rs, "NA", rt) in compound_paths_2
+
+ contained = False
+ for vv, ww, kk in ug.out_edges(start_node, keys=True):
+ if len(edge_to_cpath.get( (vv, ww, kk), [] )) > 1:
+ contained = True
+
+ if not contained:
+ compound_paths_3[k] = val
+ if DEBUG_LOG_LEVEL > 1:
+ print "compound", k
+
+ compound_paths = {}
+ for s, v, t in compound_paths_3:
+ rs = reverse_end(t)
+ rt = reverse_end(s)
+ if (rs, "NA", rt) not in compound_paths_3:
+ continue
+ compound_paths[ (s, v, t) ] = compound_paths_3[ (s, v, t) ]
+
+ return compound_paths
+
+def parse_args(argv):
+
+    parser = argparse.ArgumentParser(description='a string graph assembler designed for handling diploid genomes')
+ parser.add_argument('overlap_file', help='a file that contains the overlap information.')
+ parser.add_argument('--min_len', type=int, default=4000,
+ help='minimum length of the reads to be considered for assembling')
+ parser.add_argument('--min_idt', type=float, default=96,
+ help='minimum alignment identity of the reads to be considered for assembling')
+ parser.add_argument('--lfc', action="store_true", default=False,
+ help='use local flow constraint method rather than best overlap method to resolve knots in string graph')
+ args = parser.parse_args(argv[1:])
+ return args
+
+def main(argv=sys.argv):
+ args = parse_args(argv)
+
+ # transitivity reduction, remove spurs, remove putative edges caused by repeats
+ sg, sg_r, edge_data = generate_string_graph(args)
+
+
+ simple_paths = {}
+ dual_path = {}
+
+
+ sg2 = nx.DiGraph()
+
+ for v, w in edge_data:
+
+ assert (reverse_end(w), reverse_end(v)) in edge_data
+
+ #if (v, w) in masked_edges:
+ # continue
+
+ rid, sp, tp, length, score, identity, type_ = edge_data[ (v, w) ]
+ if type_ != "G":
+ continue
+
+ label = "%s:%d-%d" % (rid, sp, tp)
+ sg2.add_edge( v, w, label = label, length = length, score = score)
+
+
+ # utg construction phase 1, identify all simple paths
+ s_nodes = set()
+ t_nodes = set()
+ simple_nodes = set()
+
+ all_nodes = sg2.nodes()
+ for n in all_nodes:
+ in_degree = len( sg2.in_edges(n) )
+ out_degree = len( sg2.out_edges(n) )
+ if in_degree == 1 and out_degree == 1:
+ simple_nodes.add(n)
+ else:
+ if out_degree != 0:
+ s_nodes.add(n)
+ if in_degree != 0:
+ t_nodes.add(n)
+
+
+ free_edges = set(sg2.edges())
+
+ if DEBUG_LOG_LEVEL > 1:
+ for s in list(simple_nodes):
+ print "simple_node", s
+ for s in list(s_nodes):
+ print "s_node", s
+ for s in list(t_nodes):
+ print "t_node", s
+
+ for v,w in free_edges:
+ if (reverse_end(w), reverse_end(v) ) not in free_edges:
+ print "bug", v,w
+            print reverse_end(w), reverse_end(v)
+
+ while len(free_edges) != 0:
+ if len(s_nodes) != 0:
+ n = s_nodes.pop()
+ if DEBUG_LOG_LEVEL > 1:
+ print "initial utg 1", n
+ else:
+ e = free_edges.pop()
+ free_edges.add(e)
+ n = e[0]
+ if DEBUG_LOG_LEVEL > 1:
+ print "initial utg 2", n
+
+ path = []
+ path_length =0
+ path_score = 0
+ for v, w in sg2.out_edges(n):
+ if (v, w) not in free_edges:
+ continue
+ rv = reverse_end(v)
+ rw = reverse_end(w)
+
+ path_length = 0
+ path_score = 0
+ v0 = v
+ w0 = w
+ path = [v, w]
+ path_edges = set()
+ path_edges.add( (v, w) )
+ path_length += edge_data[ (v, w) ][3]
+ path_score += edge_data[ (v, w) ][4]
+ free_edges.remove( (v, w) )
+
+ r_path_length = 0
+ r_path_score = 0
+ rv0 = rv
+ rw0 = rw
+ r_path = [rv, rw] # need to reverse again
+ r_path_edges = set()
+ r_path_edges.add( (rw, rv) )
+ r_path_length += edge_data[ (rw, rv) ][3]
+ r_path_score += edge_data[ (rw, rv) ][4]
+ free_edges.remove( (rw, rv) )
+
+ while w in simple_nodes:
+ w, w_ = sg2.out_edges(w)[0]
+ if (w, w_) not in free_edges:
+ break
+ rw_, rw = reverse_end(w_), reverse_end(w)
+
+ if ( rw_, rw ) in path_edges:
+ break
+
+ path.append(w_)
+ path_edges.add( (w, w_) )
+ path_length += edge_data[ (w, w_) ][3]
+ path_score += edge_data[ (w, w_) ][4]
+ free_edges.remove( (w, w_) )
+
+ r_path.append(rw_)
+ r_path_edges.add( (rw_, rw) )
+ r_path_length += edge_data[ (rw_, rw) ][3]
+ r_path_score += edge_data[ (rw_, rw) ][4]
+ free_edges.remove( (rw_, rw) )
+
+
+ w = w_
+
+ simple_paths[ (v0, w0, path[-1]) ] = path_length, path_score, path
+ r_path.reverse()
+ assert r_path[0] == reverse_end(path[-1])
+ simple_paths[ (r_path[0], rw0, rv0) ] = r_path_length, r_path_score, r_path
+
+ if DEBUG_LOG_LEVEL > 1:
+ print path_length, path_score, path
+
+ dual_path[ (r_path[0], rw0, rv0) ] = (v0, w0, path[-1])
+ dual_path[ (v0, w0, path[-1]) ] = (r_path[0], rw0, rv0)
+
+
+
+ ug = nx.MultiDiGraph()
+ u_edge_data = {}
+ circular_path = set()
+
+ for s, v, t in simple_paths:
+ length, score, path = simple_paths[ (s, v, t) ]
+ u_edge_data[ (s, t, v) ] = (length, score, path, "simple")
+ if s != t:
+ ug.add_edge(s, t, key = v, type_ = "simple", via = v, length = length, score = score)
+ else:
+ circular_path.add( (s, t, v) )
+
+
+ if DEBUG_LOG_LEVEL > 1:
+ with open("utg_data0","w") as f:
+ for s, t, v in u_edge_data:
+ rs = reverse_end(t)
+ rt = reverse_end(s)
+ rv = reverse_end(v)
+ assert (rs, rt, rv) in u_edge_data
+ length, score, path_or_edges, type_ = u_edge_data[ (s, t, v) ]
+
+ if type_ == "compound":
+ path_or_edges = "|".join( [ ss+"~"+vv+"~"+tt for ss, tt, vv in path_or_edges ] )
+ else:
+ path_or_edges = "~".join( path_or_edges )
+ print >>f, s, v, t, type_, length, score, path_or_edges
+
+ # identify spurs in the utg graph
+    # Currently, we use ad-hoc logic to filter out shorter utgs, but we can
+ # add proper alignment comparison later to remove redundant utgs
+
+ utg_spurs = set()
+ all_nodes = ug.nodes()
+
+ ug2 = ug.copy()
+
+ s_candidates = set( )
+ for v in ug2.nodes():
+ if ug2.in_degree(v) == 0:
+ s_candidates.add(v)
+
+ while len(s_candidates) > 0:
+ n = s_candidates.pop()
+ if ug2.in_degree(n) != 0:
+ continue
+ n_ego_graph = nx.ego_graph( ug2, n, radius = 10 )
+ n_ego_node_set = set( n_ego_graph.nodes() )
+ for b_node in n_ego_graph.nodes():
+ if ug2.in_degree(b_node) <= 1:
+ continue
+
+ with_extern_node = False
+ b_in_nodes = [e[0] for e in ug2.in_edges(b_node)]
+
+ if len(b_in_nodes) == 1:
+ continue
+
+ for v in b_in_nodes:
+ if v not in n_ego_node_set:
+ with_extern_node = True
+ break
+
+ if not with_extern_node:
+ continue
+
+ s_path = nx.shortest_path( ug2, n, b_node )
+ v1 = s_path[0]
+ total_length = 0
+ for v2 in s_path[1:]:
+ for s, t, v in ug2.out_edges(v1, keys=True):
+ if t != v2:
+ continue
+ length, score, edges, type_ = u_edge_data[ (s, t, v) ]
+ total_length += length
+ v1 = v2
+
+ if total_length >= 50000:
+ continue
+
+ v1 = s_path[0]
+ for v2 in s_path[1:]:
+ for s, t, v in ug2.out_edges(v1, keys=True):
+ if t != v2:
+ continue
+ length, score, edges, type_ = u_edge_data[ (s, t, v) ]
+ rs = reverse_end(t)
+ rt = reverse_end(s)
+ rv = reverse_end(v)
+ try:
+ ug2.remove_edge( s, t, key= v)
+ ug2.remove_edge( rs, rt, key= rv)
+ u_edge_data[ (s, t, v) ] = length, score, edges, "spur:2"
+ u_edge_data[ (rs, rt, rv) ] = length, score, edges, "spur:2"
+ except Exception:
+ pass
+
+                if ug2.in_degree(v2) == 0:
+ s_candidates.add(v2)
+ v1 = v2
+ break
+
+ #phase 2, finding all "consistent" compound paths
+ compound_paths = construct_compound_paths(ug2, u_edge_data)
+ compound_path_file = open("c_path","w")
+
+ ug2_edges = set(ug2.edges(keys = True))
+ edges_to_remove = set()
+ for s, v, t in compound_paths:
+ width, length, score, bundle_edges = compound_paths[ (s, v, t) ]
+ print >> compound_path_file, s,v,t, width, length, score, "|".join( [e[0]+"~"+e[2]+"~"+e[1] for e in bundle_edges] )
+ for ss, tt, vv in bundle_edges:
+ if (ss, tt, vv) in ug2_edges:
+ edges_to_remove.add( (ss, tt, vv) )
+
+
+ for s, t, v in edges_to_remove:
+ ug2.remove_edge( s, t ,v )
+ length, score, edges, type_ = u_edge_data[ (s, t, v) ]
+ if type_ != "spur":
+ u_edge_data[ (s, t, v) ] = length, score, edges, "contained"
+
+
+ for s, v, t in compound_paths:
+ width, length, score, bundle_edges = compound_paths[ (s, v, t) ]
+ u_edge_data[ (s, t, v) ] = (length, score, bundle_edges, "compound")
+ ug2.add_edge( s, t, key = v, via = v, type_="compound", length = length, score = score)
+
+ assert v == "NA"
+ rs = reverse_end(t)
+ rt = reverse_end(s)
+ assert (rs, v, rt) in compound_paths
+ dual_path[ (s, v, t) ] = (rs, v, rt)
+ dual_path[ (rs, v, rt) ] = (s, v, t)
+
+ compound_path_file.close()
+
+
+ # remove short utg using local flow consistent rule
+ """
+    short UTGs like this can be removed; this kind of utg is likely an artifact of repeats
+ >____ _____>
+ \__UTG_>__/
+ <____/ \_____<
+ """
+ ug_edge_to_remove = set()
+ for s, t, v in ug.edges(keys=True):
+ if ug2.in_degree(s) == 1 and ug2.out_degree(s) == 2 and \
+ ug2.in_degree(t) == 2 and ug2.out_degree(t) == 1:
+ length, score, path_or_edges, type_ = u_edge_data[ (s, t, v) ]
+ if length < 60000:
+ rs = reverse_end(t)
+ rt = reverse_end(s)
+ rv = reverse_end(v)
+ ug_edge_to_remove.add( (s, t, v) )
+ ug_edge_to_remove.add( (rs, rt, rv) )
+ for s, t, v in list(ug_edge_to_remove):
+ ug2.remove_edge(s, t, key=v)
+ length, score, edges, type_ = u_edge_data[ (s, t, v) ]
+ u_edge_data[ (s, t, v) ] = length, score, edges, "repeat_bridge"
+
+ ug = ug2
+
+ with open("utg_data","w") as f:
+ for s, t, v in u_edge_data:
+ length, score, path_or_edges, type_ = u_edge_data[ (s, t, v) ]
+
+ if v == "NA":
+ path_or_edges = "|".join( [ ss+"~"+vv+"~"+tt for ss, tt, vv in path_or_edges ] )
+ else:
+ path_or_edges = "~".join( path_or_edges )
+ print >>f, s, v, t, type_, length, score, path_or_edges
+
+ # contig construction from utgs
+
+ s_nodes = set()
+ t_nodes = set()
+ simple_nodes = set()
+ simple_out = set()
+ simple_in = set()
+
+ all_nodes = ug.nodes()
+ for n in all_nodes:
+ in_degree = len( ug.in_edges(n) )
+ out_degree = len( ug.out_edges(n) )
+ if in_degree == 1 and out_degree == 1:
+ simple_nodes.add(n)
+ else:
+ if out_degree != 0:
+ s_nodes.add(n)
+ if in_degree != 0:
+ t_nodes.add(n)
+ if out_degree == 1:
+ simple_out.add(n)
+ if in_degree == 1:
+ simple_in.add(n)
+
+ all_nodes = set(all_nodes)
+ c_path = []
+
+ free_edges = set()
+ for s, t, v in ug.edges(keys=True):
+ free_edges.add( (s, t, v) )
+
+ while len(free_edges) != 0:
+
+ if len(s_nodes) != 0:
+ n = s_nodes.pop()
+ else:
+ e = free_edges.pop()
+ n = e[0]
+
+ for s, t, v in ug.out_edges(n, keys=True):
+ path_start = n
+ path_end = None
+ path_key = None
+ path = []
+ path_length = 0
+ path_score = 0
+ path_nodes = set()
+ path_nodes.add(s)
+ if DEBUG_LOG_LEVEL > 1:
+ print "check 1", s, t, v
+ path_key = t
+ t0 = s
+ while t in simple_out:
+ if t in path_nodes:
+ break
+ rt = reverse_end(t)
+ if rt in path_nodes:
+ break
+
+ path.append( (t0, t, v) )
+ path_nodes.add(t)
+ length, score, path_or_edges, type_ = u_edge_data[ (t0, t, v) ]
+ path_length += length
+ path_score += score
+ assert len( ug.out_edges( t, keys=True ) ) == 1 # t is "simple_out" node
+ t0, t, v = ug.out_edges( t, keys=True )[0]
+
+ path.append( (t0, t, v) )
+ length, score, path_or_edges, type_ = u_edge_data[ (t0, t, v) ]
+ path_length += length
+ path_score += score
+ path_nodes.add(t)
+ path_end = t
+
+ c_path.append( (path_start, path_key, path_end, path_length, path_score, path, len(path)) )
+ if DEBUG_LOG_LEVEL > 1:
+ print "c_path", path_start, path_key, path_end, path_length, path_score, len(path)
+ for e in path:
+ if e in free_edges:
+ free_edges.remove( e )
+
+ if DEBUG_LOG_LEVEL > 1:
+ print "left over edges:", len(free_edges)
+
+
+
+ free_edges = set()
+ for s, t, v in ug.edges(keys=True):
+ free_edges.add( (s, t, v) )
+
+
+ ctg_id = 0
+
+ ctg_paths = open("ctg_paths","w")
+
+ c_path.sort( key=lambda x: -x[3] )
+
+
+ for path_start, path_key, path_end, p_len, p_score, path, n_edges in c_path:
+ length = 0
+ score = 0
+ length_r = 0
+ score_r = 0
+
+ non_overlapped_path = []
+ non_overlapped_path_r = []
+ for s, t, v in path:
+ if v != "NA":
+ rs, rt, rv = reverse_end(t), reverse_end(s), reverse_end(v)
+ else:
+ rs, rt, rv = reverse_end(t), reverse_end(s), "NA"
+ if (s, t, v) in free_edges and (rs, rt, rv) in free_edges:
+ non_overlapped_path.append( (s,t,v) )
+ non_overlapped_path_r.append( (rs, rt, rv) )
+ length += u_edge_data[ (s, t, v) ][0]
+ score += u_edge_data[ (s, t, v) ][1]
+ length_r += u_edge_data[ (rs, rt, rv) ][0]
+ score_r += u_edge_data[ (rs, rt, rv) ][1]
+ else:
+ break
+
+ if len(non_overlapped_path) == 0:
+ continue
+ s0, t0, v0 = non_overlapped_path[0]
+ end_node = non_overlapped_path[-1][1]
+
+ print >> ctg_paths, "%06dF" % ctg_id, "ctg_linear", s0+"~"+v0+"~"+t0, end_node, length, score, "|".join([ c[0]+"~"+c[2]+"~"+c[1] for c in non_overlapped_path ] )
+ non_overlapped_path_r.reverse()
+ s0, t0, v0 = non_overlapped_path_r[0]
+ end_node = non_overlapped_path_r[-1][1]
+ print >> ctg_paths, "%06dR" % ctg_id, "ctg_linear", s0+"~"+v0+"~"+t0, end_node, length_r, score_r, "|".join([ c[0]+"~"+c[2]+"~"+c[1] for c in non_overlapped_path_r ] )
+ ctg_id += 1
+ for e in non_overlapped_path:
+ if e in free_edges:
+ free_edges.remove(e)
+ for e in non_overlapped_path_r:
+ if e in free_edges:
+ free_edges.remove(e)
+
+
+
+ for s, t, v in list(circular_path):
+ length, score, path, type_ = u_edge_data[ (s, t, v) ]
+ print >> ctg_paths, "%6d" % ctg_id, "ctg_circular", s+"~"+v+"~"+t, t, length, score, s+"~"+v+"~"+t
+ ctg_id += 1
+
+ ctg_paths.close()
diff --git a/FALCON_unzip/falcon_unzip/phasing.py b/FALCON_unzip/falcon_unzip/phasing.py
new file mode 100644
index 0000000..5592869
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/phasing.py
@@ -0,0 +1,569 @@
+from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
+ makePypeLocalFile, fn, PypeTask)
+from falcon_kit.FastaReader import FastaReader
+import argparse
+import logging
+import os
+import re
+import shlex
+import subprocess
+import sys
+
+cigar_re = r"(\d+)([MIDNSHP=X])"
+
+def make_het_call(self):
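+    """
+    Stream `samtools view` output for one contig, build a per-position base
+    pileup from the aligned (M) portions of each read, and write candidate
+    heterozygous positions (coverage >= 10 where the second most frequent base
+    is supported by more than 25% of the reads) to the variant_map and
+    variant_pos files, plus a mapping from internal query ids to read names.
+    """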
+
+ bam_fn = fn(self.bam_file)
+ ctg_id = self.parameters["ctg_id"]
+ ref_seq = self.parameters["ref_seq"]
+ base_dir = self.parameters["base_dir"]
+ vmap_fn = fn(self.vmap_file)
+ vpos_fn = fn(self.vpos_file)
+ q_id_map_fn = fn(self.q_id_map_file)
+
+ p = subprocess.Popen(shlex.split("samtools view %s %s" % (bam_fn, ctg_id) ), stdout=subprocess.PIPE)
+ pileup = {}
+ q_id_map = {}
+ q_max_id = 0
+ q_id = 0
+ q_name_to_id = {}
+
+ try:
+ os.makedirs("%s/%s" % (base_dir, ctg_id))
+ except OSError:
+ pass
+
+ vmap = open(vmap_fn, "w")
+ vpos = open(vpos_fn, "w")
+
+ for l in p.stdout:
+ l = l.strip().split()
+ if l[0][0] == "@":
+ continue
+
+ QNAME = l[0]
+ if QNAME not in q_name_to_id:
+ q_id = q_max_id
+ q_name_to_id[QNAME] = q_id
+ q_max_id += 1
+
+ q_id = q_name_to_id[QNAME]
+ q_id_map[q_id] = QNAME
+ FLAG = int(l[1])
+ RNAME = l[2]
+ POS = int(l[3]) - 1 # convert to zero base
+ CIGAR = l[5]
+ SEQ = l[9]
+ rp = POS
+ qp = 0
+
+ skip_base = 0
+ total_aln_pos = 0
+ for m in re.finditer(cigar_re, CIGAR):
+ adv = int(m.group(1))
+ total_aln_pos += adv
+
+ if m.group(2) == "S":
+ skip_base += adv
+
+ if 1.0 - 1.0 * skip_base / total_aln_pos < 0.1:
+ continue
+ if total_aln_pos < 2000:
+ continue
+
+ for m in re.finditer(cigar_re, CIGAR):
+ adv = int(m.group(1))
+ if m.group(2) == "S":
+ qp += adv
+ if m.group(2) == "M":
+ matches = []
+ for i in range(adv):
+ matches.append( (rp, SEQ[qp]) )
+ rp += 1
+ qp += 1
+ matches = matches[1:-1]
+ for pos, b in matches:
+ pileup.setdefault(pos, {})
+ pileup[pos].setdefault(b, [])
+ pileup[pos][b].append(q_id)
+ elif m.group(2) == "I":
+ for i in range(adv):
+ qp += 1
+ elif m.group(2) == "D":
+ for i in range(adv):
+ rp += 1
+
+ pos_k = pileup.keys()
+ pos_k.sort()
+ th = 0.25
+ for pos in pos_k:
+ if pos < POS:
+ if len(pileup[pos]) < 2:
+ del pileup[pos]
+ continue
+ base_count = []
+ total_count = 0
+ for b in ["A", "C", "G", "T"]:
+ count = len(pileup[pos].get(b,[]))
+ base_count.append( (count, b) )
+ total_count += count
+ if total_count < 10:
+ del pileup[pos]
+ continue
+
+ base_count.sort()
+ base_count.reverse()
+ p0 = 1.0 * base_count[0][0] / total_count
+ p1 = 1.0 * base_count[1][0] / total_count
+ if p0 < 1.0 - th and p1 > th:
+ b0 = base_count[0][1]
+ b1 = base_count[1][1]
+ ref_base = ref_seq[pos]
+ print >> vpos, pos+1, ref_base, total_count, " ".join(["%s %d" % (x[1], x[0]) for x in base_count])
+ for q_id_ in pileup[pos][b0]:
+ print >> vmap, pos+1, ref_base, b0, q_id_
+ for q_id_ in pileup[pos][b1]:
+ print >> vmap, pos+1, ref_base, b1, q_id_
+ del pileup[pos]
+
+
+ q_id_map_f = open(q_id_map_fn, "w")
+ for q_id, q_name in q_id_map.items():
+ print >> q_id_map_f, q_id, q_name
+
+
+def generate_association_table(self):
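+    """
+    For every pair of nearby variant positions (within 2^16 bases, at most 500
+    pairs per left position, and with at least 6 shared reads), count how many
+    reads support each of the four allele combinations and write the 2x2
+    contingency counts to the association table.
+    """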
+
+ vmap_fn = fn(self.vmap_file)
+ atable_fn = fn(self.atable_file)
+ ctg_id = self.parameters["ctg_id"]
+ base_dir = self.parameters["base_dir"]
+
+ vmap = {}
+ v_positions = []
+
+ with open(vmap_fn) as f:
+ for l in f:
+ l = l.strip().split()
+ pos = int(l[0])
+ ref_b = l[1]
+ v_b = l[2]
+ q_id = int(l[3])
+ if (pos, ref_b) not in vmap:
+ v_positions.append( (pos, ref_b) )
+ vmap.setdefault( (pos, ref_b), {} )
+ vmap[ (pos, ref_b) ].setdefault(v_b, [])
+ vmap[ (pos, ref_b) ][v_b].append( q_id )
+
+
+ #xary = []
+ #yary = []
+ with open(atable_fn, "w") as out_f:
+ for i1 in xrange(len(v_positions)):
+ link_count = 0
+ for i2 in xrange(i1+1, len(v_positions)):
+ pos1, rb1 = v_positions[i1]
+ pos2, rb2 = v_positions[i2]
+ if pos2 - pos1 > (1 << 16):
+ continue
+ ct = {}
+ p1table = []
+ p2table = []
+ s1 = 0
+ list1 = vmap[ (pos1, rb1) ].items()
+ for b1, qids1 in list1:
+ p1table.append( (b1, len(qids1) ) )
+ s1 += len(qids1)
+
+ s2 = 0
+ list2 = vmap[ (pos2, rb2) ].items()
+ for b2, qids2 in list2:
+ p2table.append( (b2, len(qids2) ) )
+ s2 += len(qids2)
+
+ total_s = 0
+ for b1, qids1 in list1:
+ for b2, qids2 in list2:
+ s = len(set(qids1) & set(qids2))
+ ct[(b1,b2)] = s
+ total_s += s
+ if total_s < 6:
+ continue
+
+ b11 = p1table[0][0]
+ b12 = p1table[1][0]
+ b21 = p2table[0][0]
+ b22 = p2table[1][0]
+ print >> out_f, pos1, b11, b12, pos2, b21, b22, ct[(b11,b21)], ct[(b11,b22)], ct[(b12,b21)], ct[(b12,b22)]
+
+
+ #xary.append(pos1)
+ #yary.append(pos2)
+ link_count += 1
+ if link_count > 500:
+ break
+
+def get_score( c_score, pos1, pos2, s1, s2 ):
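+    """
+    Look up the co-occurrence score for assigning allele states s1 and s2 to
+    positions pos1 and pos2; the arguments are swapped if necessary so the
+    lookup key is always (smaller position, larger position).
+    """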
+ if pos1 > pos2:
+ pos1, pos2 = pos2, pos1
+ s1, s2 = s2, s1
+ b11, b12 = s1
+ b21, b22 = s2
+ return c_score[ (pos1, pos2) ][ (b11+b21, b12+b22) ]
+
+def get_phased_blocks(self):
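+    """
+    Greedily assign each heterozygous position one of its two allele orderings
+    so that it agrees with already-assigned neighbours, refine the assignment
+    for up to 10 sweeps, then chain positions whose phasing is well supported
+    on both sides (score >= 10) into phase blocks of more than 3 variants and
+    write them out as "P"/"V" records.
+    """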
+ vmap_fn = fn(self.vmap_file)
+ atable_fn = fn(self.atable_file)
+ p_variant_fn = fn(self.phased_variant_file)
+
+ left_connect = {}
+ right_connect = {}
+
+ c_score = {}
+ states = {}
+ positions = set()
+
+
+
+ ref_base = {}
+ with open(vmap_fn) as f:
+ for l in f:
+ l = l.strip().split()
+ pos = int(l[0])
+ ref_b = l[1]
+ v_b = l[2]
+ q_id = int(l[3])
+ ref_base[pos] = ref_b
+
+ with open(atable_fn) as f:
+ for l in f:
+ l = l.strip().split()
+ pos1, b11, b12, pos2, b21, b22, s11, s12, s21, s22 = l
+ s11, s12, s21, s22 = int(s11), int(s12), int(s21), int(s22)
+ if abs(s11+s22-s12-s21) < 6:
+ continue
+ pos1 = int(pos1)
+ pos2 = int(pos2)
+ positions.add(pos1)
+ positions.add(pos2)
+ right_connect.setdefault(pos1, [])
+ right_connect[pos1].append(pos2)
+ left_connect.setdefault(pos2, [])
+ left_connect[pos2].append(pos1)
+ c_score[ (pos1, pos2) ] = { (b11+b21, b12+b22): s11 + s22, (b12+b22, b11+b21): s11 + s22,
+ (b12+b21, b11+b22): s12 + s21, (b11+b22, b12+b21): s12 + s21 }
+
+
+ if pos1 not in states:
+ st1 = (b11, b12)
+ st2 = (b12, b11)
+ score1 = 0
+ score2 = 0
+ for pp in left_connect.get(pos1,[]):
+ if pp in states:
+ st0 = states[pp]
+ else:
+ continue
+ score1 += get_score( c_score, pp, pos1, st0, st1 )
+ score2 += get_score( c_score, pp, pos1, st0, st2 )
+
+ for pp in right_connect.get(pos1,[]):
+ if pp in states:
+ st0 = states[pp]
+ else:
+ continue
+ score1 += get_score( c_score, pos1, pp, st1, st0 )
+ score2 += get_score( c_score, pos1, pp, st2, st0 )
+
+ if score1 >= score2:
+ states[pos1] = st1
+ else:
+ states[pos1] = st2
+
+ if pos2 not in states:
+ st1 = (b21, b22)
+ st2 = (b22, b21)
+ score1 = 0
+ score2 = 0
+ for pp in left_connect.get(pos2,[]):
+ if pp in states:
+ st0 = states[pp]
+ else:
+ continue
+ score1 += get_score( c_score, pp, pos2, st0, st1 )
+ score2 += get_score( c_score, pp, pos2, st0, st2 )
+
+ for pp in right_connect.get(pos2,[]):
+ if pp in states:
+ st0 = states[pp]
+ else:
+ continue
+ score1 += get_score( c_score, pos2, pp, st1, st0 )
+ score2 += get_score( c_score, pos2, pp, st2, st0 )
+
+ if score1 >= score2:
+ states[pos2] = st1
+ else:
+ states[pos2] = st2
+
+ positions = list(positions)
+ positions.sort()
+
+
+ iter_count = 0
+ while 1:
+ iter_count += 1
+ if iter_count > 10:
+ break
+ update_count = 0
+ for p in positions:
+ b1, b2 = states[p]
+ st1 = (b1, b2)
+ st2 = (b2, b1)
+
+ score1 = 0
+ score2 = 0
+ for pp in left_connect.get(p,[]):
+ st0 = states[pp]
+ score1 += get_score( c_score, pp, p, st0 ,st1)
+ score2 += get_score( c_score, pp, p, st0, st2)
+
+ #for pp in right_connect.get(p,[]):
+ # st0 = states[pp]
+ # score1 += get_score( c_score, p, pp, st1 ,st0)
+ # score2 += get_score( c_score, p, pp, st2, st0)
+
+ if score1 >= score2:
+ states[p] = st1
+ else:
+ states[p] = st2
+ update_count += 1
+ if update_count == 0:
+ break
+
+
+ right_extent = {}
+ right_score = {}
+ left_extent = {}
+ left_score = {}
+
+
+ for p in positions:
+
+ left_extent[p] = p
+ left_score[p] = 0
+ if p in left_connect:
+ left = p
+ st0 = states[p]
+ st0_ = st0[1], st0[0]
+ for pp in left_connect[p]:
+ st1 = states[pp]
+ s = get_score( c_score, pp, p, st1, st0)
+ s_ = get_score( c_score, pp, p, st1, st0_)
+ left_score[p] += s - s_
+ if s - s_ > 0 and pp < left:
+ left = pp
+ left_extent[p] = left
+
+ right_extent[p] = p
+ right_score[p] = 0
+ if p in right_connect:
+ right = p
+ st0 = states[p]
+ st0_ = st0[1], st0[0]
+ for pp in right_connect[p]:
+ st1 = states[pp]
+ s = get_score( c_score, p, pp, st0, st1)
+ s_ = get_score( c_score, p, pp, st0_, st1)
+ right_score[p] += s - s_
+ if s - s_ > 0 and pp > right:
+ right = pp
+ right_extent[p] = right
+
+
+
+
+ phase_block_id = 1
+ phase_blocks = {}
+ pb = []
+
+ max_right_ext = 0
+ for p in positions:
+ if right_score[p] < 10 or left_score[p] < 10:
+ continue
+ b1, b2 = states[p]
+ if max_right_ext < left_extent[p]:
+ if len(pb) > 3:
+ phase_blocks[phase_block_id] = pb
+ phase_block_id += 1
+ pb = []
+ pb.append( (p, b1, b2) )
+ if right_extent[p] > max_right_ext:
+ max_right_ext = right_extent[p]
+ if len(pb) > 3:
+ phase_blocks[phase_block_id] = pb
+ else:
+ phase_block_id -= 1
+
+
+ with open(p_variant_fn, "w") as out_f:
+ for pid in xrange(1, phase_block_id+1):
+ if len(phase_blocks[pid]) == 0:
+ continue
+ min_ = min( [x[0] for x in phase_blocks[pid]] )
+ max_ = max( [x[0] for x in phase_blocks[pid]] )
+
+ print >>out_f, "P", pid, min_, max_, max_ - min_, len(phase_blocks[pid]), 1.0 * (max_-min_)/len(phase_blocks[pid])
+ for p, b1, b2 in phase_blocks[pid]:
+ rb = ref_base[p]
+ print >>out_f, "V", pid, p, "%d_%s_%s" % (p,rb,b1), "%d_%s_%s" % (p,rb,b2), left_extent[p], right_extent[p], left_score[p], right_score[p]
+
+def get_phased_reads(self):
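+    """
+    For every read, count how many phased variants it shares with each phase of
+    each phase block; the read is assigned to a phase (and written to the
+    phased-reads file) when it supports that phase by at least two more
+    variants than the opposite phase.
+    """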
+
+ q_id_map_fn = fn(self.q_id_map_file)
+ vmap_fn = fn(self.vmap_file)
+ p_variant_fn = fn(self.phased_variant_file)
+ parameters = self.parameters
+
+ ctg_id = parameters["ctg_id"]
+
+ phased_read_fn = fn(self.phased_read_file)
+
+ rid_map = {}
+ with open(q_id_map_fn) as f:
+ for l in f:
+ l = l.strip().split()
+ rid_map[int(l[0])] = l[1]
+
+
+ read_to_variants = {}
+ variant_to_reads = {}
+ with open(vmap_fn) as f:
+ for l in f:
+ l = l.strip().split()
+ variant = "_".join(l[:3])
+ read_id = int(l[3])
+ read_to_variants.setdefault(read_id, set())
+ read_to_variants[read_id].add(variant)
+ variant_to_reads.setdefault(variant, set())
+ variant_to_reads[variant].add(read_id)
+
+
+ variant_to_phase = {}
+ with open(p_variant_fn) as f:
+ for l in f:
+ """line format example: V 1 6854 6854_A_A 6854_A_G 6854 22781"""
+ l = l.strip().split()
+ if l[0] != "V":
+ continue
+ pb_id = int(l[1])
+ variant_to_phase[ l[3] ] = (pb_id, 0)
+ variant_to_phase[ l[4] ] = (pb_id, 1)
+
+ with open(phased_read_fn, "w") as out_f:
+ for r in read_to_variants:
+ vl = {}
+ pl = set()
+ for v in list( read_to_variants[r] ):
+ if v in variant_to_phase:
+ p = variant_to_phase[v]
+ vl[ p ] = vl.get(p, 0) + 1
+ pl.add(p[0])
+ pl = list(pl)
+ pl.sort()
+ for p in pl:
+ if vl.get( (p,0), 0) - vl.get( (p,1), 0) > 1:
+ print >> out_f, r, ctg_id, p, 0, vl.get( (p,0), 0), vl.get( (p,1), 0), rid_map[r]
+ elif vl.get( (p,1), 0) - vl.get( (p,0), 0) > 1:
+ print >> out_f, r, ctg_id, p, 1, vl.get( (p,0), 0), vl.get( (p,1), 0), rid_map[r]
+
+def phasing(args):
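+    """
+    Build and run a small pypeFLOW workflow for a single contig: het-variant
+    calling from the sorted bam, association-table generation, phase-block
+    construction, and finally read-to-phase assignment, with all outputs
+    written under base_dir/<ctg_id>/.
+    """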
+ bam_fn = args.bam
+ fasta_fn = args.fasta
+ ctg_id = args.ctg_id
+ base_dir = args.base_dir
+
+ ref_seq = ""
+ for r in FastaReader(fasta_fn):
+ rid = r.name.split()[0]
+ if rid != ctg_id:
+ continue
+ ref_seq = r.sequence.upper()
+
+ wf = PypeProcWatcherWorkflow(
+ max_jobs=1,
+ )
+
+ bam_file = makePypeLocalFile(bam_fn)
+ vmap_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'het_call', "variant_map") )
+ vpos_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'het_call', "variant_pos") )
+ q_id_map_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'het_call', "q_id_map") )
+ parameters = {}
+ parameters["ctg_id"] = ctg_id
+ parameters["ref_seq"] = ref_seq
+ parameters["base_dir"] = base_dir
+
+ make_het_call_task = PypeTask( inputs = { "bam_file": bam_file },
+ outputs = { "vmap_file": vmap_file, "vpos_file": vpos_file, "q_id_map_file": q_id_map_file },
+ parameters = parameters,
+ ) (make_het_call)
+
+ wf.addTasks([make_het_call_task])
+
+
+
+
+ atable_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'g_atable', "atable") )
+ parameters = {}
+ parameters["ctg_id"] = ctg_id
+ parameters["base_dir"] = base_dir
+ generate_association_table_task = PypeTask( inputs = { "vmap_file": vmap_file },
+ outputs = { "atable_file": atable_file },
+ ) (generate_association_table)
+
+ wf.addTasks([generate_association_table_task])
+
+
+
+
+ phased_variant_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'get_phased_blocks', "phased_variants") )
+ get_phased_blocks_task = PypeTask( inputs = { "vmap_file": vmap_file, "atable_file": atable_file },
+ outputs = { "phased_variant_file": phased_variant_file },
+ ) (get_phased_blocks)
+ wf.addTasks([get_phased_blocks_task])
+
+
+
+
+ phased_read_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, "phased_reads") )
+ get_phased_reads_task = PypeTask( inputs = { "vmap_file": vmap_file,
+ "q_id_map_file": q_id_map_file,
+ "phased_variant_file": phased_variant_file },
+ outputs = { "phased_read_file": phased_read_file },
+ parameters = {"ctg_id": ctg_id},
+ ) (get_phased_reads)
+ wf.addTasks([get_phased_reads_task])
+
+
+ wf.refreshTargets()
+ #with open("fc_phasing_wf.dot", "w") as f:
+ # print >>f, wf.graphvizDot
+
+def parse_args(argv):
+ parser = argparse.ArgumentParser(description='phasing variants and reads from a bam file')
+    # we can run this in parallel mode in the future
+ #parser.add_argument('--n_core', type=int, default=4,
+ # help='number of processes used for generating consensus')
+ parser.add_argument('--bam', type=str, help='path to sorted bam file', required=True)
+    parser.add_argument('--fasta', type=str, help='path to the fasta file containing the contig', required=True)
+ parser.add_argument('--ctg_id', type=str, help='contig identifier in the bam file', required=True)
+    parser.add_argument('--base_dir', type=str, default="./", help='the output base_dir, defaults to the current working directory')
+
+
+ args = parser.parse_args(argv[1:])
+ return args
+
+def main(argv=sys.argv):
+ logging.basicConfig()
+ args = parse_args(argv)
+ phasing(args)
diff --git a/FALCON_unzip/falcon_unzip/phasing_readmap.py b/FALCON_unzip/falcon_unzip/phasing_readmap.py
new file mode 100644
index 0000000..2e30c4b
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/phasing_readmap.py
@@ -0,0 +1,73 @@
+import argparse
+import logging
+import os
+import re
+import sys
+
+
+def get_phasing_readmap(args):
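+    """
+    Translate daligner pread ids back to the original read names (via the
+    dumped rawread/pread id lists), look up each read's phase from the
+    phased-reads file, and write rid_to_phase.<ctg_id> mapping each zero-padded
+    pread id to its (phase block, phase); unphased reads get (-1, 0).
+    """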
+
+ phased_reads = args.phased_reads
+ read_map_dir = args.read_map_dir
+ the_ctg_id = args.ctg_id
+ base_dir = args.base_dir
+
+ rawread_id_file = os.path.join(read_map_dir, 'dump_rawread_ids', 'rawread_ids')
+ pread_id_file = os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids')
+ rid_to_oid = open(rawread_id_file).read().split('\n') #daligner raw read id to the original ids
+ pid_to_fid = open(pread_id_file).read().split('\n') #daligner pread id to the fake ids
+
+ def pid_to_oid(pid):
+ fid = pid_to_fid[int(pid)]
+ rid = int(fid.split('/')[1])/10
+ return rid_to_oid[int(rid)]
+
+ rid_to_oid = open(rawread_id_file).read().split('\n') #daligner raw read id to the original ids
+ pid_to_fid = open(pread_id_file).read().split('\n') #daligner pread id to the fake ids
+
+
+ rid_to_phase = {}
+ with open(phased_reads) as f:
+ for row in f:
+ row = row.strip().split()
+ rid_to_phase[row[6]] = (int(row[2]), int(row[3]))
+
+ arid_to_phase = {}
+ map_fn = os.path.join(read_map_dir, 'pread_to_contigs')
+ with open(map_fn) as f:
+ for row in f:
+ row = row.strip().split()
+ ctg_id = row[1]
+ if not ctg_id.startswith(the_ctg_id):
+ continue
+ if int(row[3]) != 0: #not the best hit
+ continue
+ o_id = pid_to_oid(row[0])
+ phase = rid_to_phase.get( o_id, (-1, 0) )
+ arid_to_phase['%09d' % int(row[0])] = phase
+
+ with open(os.path.join(base_dir, 'rid_to_phase.%s' % the_ctg_id),'w') as f:
+ for arid, phase in arid_to_phase.items():
+ print >>f, arid, the_ctg_id, phase[0], phase[1]
+
+
+
+def parse_args(argv):
+ parser = argparse.ArgumentParser(description='mapping internal daligner read id to phase block and phase',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    # we can run this in parallel mode in the future
+ #parser.add_argument('--n_core', type=int, default=4,
+ # help='number of processes used for generating consensus')
+ parser.add_argument('--phased_reads', type=str, help='path to read vs. phase map', required=True)
+ parser.add_argument('--read_map_dir', type=str, help='path to the read map directory', required=True)
+ parser.add_argument('--ctg_id', type=str, help='contig identifier in the bam file', required=True)
+    parser.add_argument('--base_dir', type=str, default="./", help='the output base_dir, defaults to the current working directory')
+
+ args = parser.parse_args(argv[1:])
+ return args
+
+
+def main(argv=sys.argv):
+ logging.basicConfig()
+ args = parse_args(argv)
+ get_phasing_readmap(args)
diff --git a/FALCON_unzip/falcon_unzip/rr_hctg_track.py b/FALCON_unzip/falcon_unzip/rr_hctg_track.py
new file mode 100644
index 0000000..fa02434
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/rr_hctg_track.py
@@ -0,0 +1,201 @@
+from __future__ import print_function
+from falcon_kit.multiproc import Pool
+import falcon_kit.util.io as io
+import argparse
+import sys
+import glob
+import os
+from heapq import heappush, heappop, heappushpop
+
+Reader = io.CapturedProcessReaderContext
+
+
+def get_rid_to_ctg(fn):
+ rid_to_ctg = {}
+ with open(fn) as f:
+ for row in f:
+ row = row.strip().split()
+ pid, rid, oid, ctg = row
+ rid_to_ctg.setdefault( rid, set() )
+ rid_to_ctg[ rid ].add(ctg)
+ return rid_to_ctg
+
+def run_tr_stage1(db_fn, fn, min_len, bestn, rid_to_ctg, rid_to_phase):
+ cmd = 'LA4Falcon -m %s %s' % (db_fn, fn)
+ reader = Reader(cmd)
+ with reader:
+ return fn, tr_stage1(reader.readlines, min_len, bestn, rid_to_ctg, rid_to_phase)
+
+def tr_stage1(readlines, min_len, bestn, rid_to_ctg, rid_to_phase):
+ """
+    for each read in the b-read column of the LAS files, we keep the top
+    `bestn` hits, maintained with a priority queue across all overlaps
+ """
+
+ rtn = {}
+ for l in readlines():
+ l = l.strip().split()
+ q_id, t_id = l[:2]
+ overlap_len = -int(l[2])
+ idt = float(l[3])
+ q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
+ t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
+ if t_l < min_len:
+ continue
+ if q_id not in rid_to_ctg:
+ continue
+
+ t_phase = rid_to_phase[ int(t_id) ]
+ if t_phase != None:
+ ctg_id, block, phase = t_phase
+ if block != -1:
+ q_phase = rid_to_phase[ int(q_id) ]
+ if q_phase != None:
+ if q_phase[0] == ctg_id and q_phase[1] == block and q_phase[2] != phase:
+ continue
+
+ rtn.setdefault(t_id, [])
+ if len(rtn[t_id]) < bestn:
+ heappush(rtn[t_id], (overlap_len, q_id) )
+ else:
+ heappushpop(rtn[t_id], (overlap_len, q_id) )
+
+ return rtn
+
+def run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn):
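+    """
+    Run tr_stage1 over every LAS file in parallel, merge the per-file best-n
+    a-read hits for each b-read, and write a ranked rawread_to_contigs table
+    (read, contig, hit count, rank, score, in-contig flag) under
+    4-quiver/read_maps.
+    """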
+ io.LOG('preparing tr_stage1')
+ io.logstats()
+ asm_dir = os.path.abspath(os.path.join(base_dir, '2-asm-falcon'))
+ hasm_dir = os.path.abspath(os.path.join(base_dir, '3-unzip'))
+ quiver_dir = os.path.abspath(os.path.join(base_dir, '4-quiver'))
+ rid_to_ctg = get_rid_to_ctg(os.path.join(quiver_dir, 'read_maps', 'read_to_contig_map' ))
+
+ phased_read_file = os.path.join(hasm_dir, 'all_phased_reads')
+ oid_to_phase = {}
+ with open(phased_read_file) as f:
+ for row in f:
+ row = row.strip().split()
+ ctg_id, block, phase = row[1:4]
+ oid = row[6]
+ block = int(block)
+ phase = int(phase)
+ oid_to_phase[ oid ] = (ctg_id, block, phase)
+ rid_to_phase = {}
+ rawread_ids_fn = os.path.join(asm_dir, 'read_maps', 'dump_rawread_ids', 'rawread_ids')
+ rid_to_oid = open(rawread_ids_fn).read().split('\n')
+ rid_to_phase = [ None ] * len( rid_to_oid )
+ for rid, oid in enumerate(rid_to_oid):
+ rid_to_phase[rid] = oid_to_phase.get( oid, None )
+
+
+ inputs = []
+ for fn in file_list:
+ inputs.append( (run_tr_stage1, db_fn, fn, min_len, bestn, rid_to_ctg, rid_to_phase) )
+ """
+    Aggregate hits from each individual LAS file and keep the best n hits.
+    Note that this does not guarantee that the final result is globally the best n hits, especially
+    when `bestn` is too small. In that case, if a single LAS file contributes more hits,
+    some good hits will be missed.
+ """
+ bread_to_areads = {}
+ for fn, res in exe_pool.imap(io.run_func, inputs):
+ for k in res:
+ bread_to_areads.setdefault(k, [])
+ for item in res[k]:
+ if len(bread_to_areads[k]) < bestn:
+ heappush( bread_to_areads[k], item )
+ else:
+ heappushpop( bread_to_areads[k], item )
+
+ #rid_to_oid = open(os.path.join(rawread_dir, 'dump_rawread_ids', 'rawread_ids')).read().split('\n')
+
+ """
+    For each b-read, find the best contig assignment through the b -> a -> contig map.
+ """
+ with open(os.path.join(quiver_dir, 'read_maps/rawread_to_contigs'), 'w') as out_f:
+ for bread in bread_to_areads:
+
+ ctg_score = {}
+ for s, rid in bread_to_areads[bread]:
+ if rid not in rid_to_ctg: continue
+
+ ctgs = rid_to_ctg[rid]
+ for ctg in ctgs:
+ ctg_score.setdefault(ctg, [0,0])
+ ctg_score[ctg][0] += -s
+ ctg_score[ctg][1] += 1
+
+ #oid = rid_to_oid[int(bread)]
+ ctg_score = ctg_score.items()
+ ctg_score.sort( key = lambda k: k[1][0] )
+ rank = 0
+
+ for ctg, score_count in ctg_score:
+ if bread in rid_to_ctg and ctg in rid_to_ctg[bread]:
+ in_ctg = 1
+ else:
+ in_ctg = 0
+ score, count = score_count
+ #print(bread, oid, ctg, count, rank, score, in_ctg, file=out_f)
+ print(bread, ctg, count, rank, score, in_ctg, file=out_f)
+ rank += 1
+
+
+
+def try_run_track_reads(n_core, base_dir, min_len, bestn):
+ io.LOG('starting track_reads')
+
+ rawread_dir = os.path.abspath(os.path.join(base_dir, '0-rawreads'))
+
+    # TODO: better logic for locating the LAS files, or move this logic out (take a --fofn option?)
+ file_list = glob.glob( os.path.join(rawread_dir, 'm*/raw_reads.*.las'))
+ io.LOG('file list: %r' % file_list)
+
+    # likewise, should this be a parameter?
+ db_fn = os.path.join(rawread_dir, 'raw_reads.db')
+ n_core = min(n_core, len(file_list))
+ exe_pool = Pool(n_core)
+ try:
+ run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn)
+ io.LOG('finished track_reads')
+ except:
+ io.LOG('terminating track_reads workers...')
+ exe_pool.terminate()
+ raise
+
+def track_reads(n_core, base_dir, min_len, bestn, debug, silent, stream):
+ if debug:
+ n_core = 0
+ silent = False
+ if silent:
+ io.LOG = io.write_nothing
+ if stream:
+ global Reader
+ Reader = io.StreamedProcessReaderContext
+
+ try_run_track_reads(n_core, base_dir, min_len, bestn)
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description='scan the raw-read overlap information to identify the best hits from the reads \
+to the contigs, using the read_to_contig_map generated by `fc_get_read_hctg_map` in `4-quiver/read_maps/read_to_contig_map`. Assumes 2-asm-falcon/read_maps/dump_rawread_ids/rawread_ids exists.',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--n_core', type=int, default=4,
+                        help='number of processes used for tracking reads; '
+                        '0 for main process only')
+    #parser.add_argument('--fofn', type=str, help='file containing the paths of all LAS files to be processed in parallel')
+ #parser.add_argument('--db', type=str, dest='db_fn', help='read db file path')
+ parser.add_argument('--base_dir', type=str, default="./", help='the base working dir of a FALCON assembly')
+ parser.add_argument('--min_len', type=int, default=2500, help='min length of the reads')
+ parser.add_argument('--stream', action='store_true', help='stream from LA4Falcon, instead of slurping all at once; can save memory for large data')
+ parser.add_argument('--debug', '-g', action='store_true', help='single-threaded, plus other aids to debugging')
+ parser.add_argument('--silent', action='store_true', help='suppress cmd reporting on stderr')
+ parser.add_argument('--bestn', type=int, default=40, help='keep best n hits')
+ args = parser.parse_args(argv[1:])
+ return args
+
+def main(argv=sys.argv):
+ args = parse_args(argv)
+ track_reads(**vars(args))
+
+if __name__ == '__main__':
+ main()
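The bounded best-n bookkeeping in tr_stage1() and run_track_reads() above is the usual size-capped min-heap idiom from heapq; a small self-contained sketch of the same idea (the scores and read ids below are made up):

    from heapq import heappush, heappushpop

    def keep_best_n(items, bestn):
        # Keep the `bestn` largest (score, read_id) pairs with a min-heap;
        # heappushpop() inserts the new item and evicts the current smallest.
        heap = []
        for item in items:
            if len(heap) < bestn:
                heappush(heap, item)
            else:
                heappushpop(heap, item)
        return sorted(heap, reverse=True)

    # keep_best_n([(5000, '001'), (12000, '002'), (800, '003')], 2)
    # -> [(12000, '002'), (5000, '001')]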
diff --git a/FALCON_unzip/falcon_unzip/run_quiver.py b/FALCON_unzip/falcon_unzip/run_quiver.py
new file mode 100644
index 0000000..4eb20e6
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/run_quiver.py
@@ -0,0 +1,394 @@
+from falcon_kit import run_support as support
+from pypeflow.simple_pwatcher_bridge import (
+ PypeLocalFile, makePypeLocalFile, fn,
+ PypeTask,
+ PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase)
+from falcon_kit.FastaReader import FastaReader
+import glob
+import json
+import logging
+import os
+import pprint
+import re
+import sys
+import time
+import ConfigParser
+
+
+LOG = logging.getLogger(__name__)
+
+def system(call, check=False):
+ LOG.debug('$(%s)' %repr(call))
+ rc = os.system(call)
+ msg = 'Call %r returned %d.' % (call, rc)
+ if rc:
+ LOG.warning(msg)
+ if check:
+ raise Exception(msg)
+ else:
+ LOG.debug(msg)
+ return rc
+
+def mkdir(d):
+ if not os.path.isdir(d):
+ os.makedirs(d)
+
+def task_track_reads(self):
+ job_done = fn(self.job_done)
+ wd = self.parameters['wd']
+ config = self.parameters['config']
+ input_bam_fofn = config['input_bam_fofn']
+ sge_track_reads = config['sge_track_reads']
+ script_dir = os.path.join(wd)
+ script_fn = os.path.join(script_dir, 'track_reads_h.sh')
+
+ script = """\
+set -vex
+trap 'touch {job_done}.exit' EXIT
+cd {wd}
+hostname
+date
+fc_get_read_hctg_map.py --basedir ../..
+fc_rr_hctg_track.py --base_dir ../..
+mkdir -p 4-quiver/reads/
+fc_select_reads_from_bam.py --basedir ../.. {input_bam_fofn}
+date
+touch {job_done}
+""".format(**locals())
+
+ with open(script_fn,'w') as script_file:
+ script_file.write(script)
+ self.generated_script_fn = script_fn
+ #job_data['sge_option'] = sge_track_reads
+
+
+def task_run_quiver(self):
+
+ ref_fasta = fn(self.ref_fasta)
+ read_sam = fn(self.read_sam)
+
+ cns_fasta = fn(self.cns_fasta)
+ cns_fastq = fn(self.cns_fastq)
+ job_done = fn(self.job_done)
+
+ job_uid = self.parameters['job_uid']
+ wd = self.parameters['wd']
+ config = self.parameters['config']
+ ctg_id = self.parameters['ctg_id']
+
+ smrt_bin = config['smrt_bin']
+ sge_quiver = config['sge_quiver']
+ job_type = config['job_type']
+ samtools = os.path.join(smrt_bin, 'samtools')
+ pbalign = os.path.join(smrt_bin, 'pbalign')
+ makePbi = os.path.join(smrt_bin, 'makePbi')
+ variantCaller = os.path.join(smrt_bin, 'variantCaller')
+
+ script_dir = os.path.join(wd)
+ script_fn = os.path.join(script_dir , 'cns_%s.sh' % (ctg_id))
+
+ script = """\
+set -vex
+trap 'touch {job_done}.exit' EXIT
+hostname
+date
+cd {wd}
+
+{samtools} faidx {ref_fasta}
+{samtools} view -b -S {read_sam} > {ctg_id}.bam
+{pbalign} --tmpDir=/localdisk/scratch/ --nproc=24 --minAccuracy=0.75 --minLength=50\
+ --minAnchorSize=12 --maxDivergence=30 --concordant --algorithm=blasr\
+ --algorithmOptions=-useQuality --maxHits=1 --hitPolicy=random --seed=1\
+ {ctg_id}.bam {ref_fasta} aln-{ctg_id}.bam
+#{makePbi} --referenceFasta {ref_fasta} aln-{ctg_id}.bam
+({variantCaller} -x 5 -X 120 -q 20 -j 24 -r {ref_fasta} aln-{ctg_id}.bam\
+ -o {cns_fasta} -o {cns_fastq}) || echo quiver failed
+date
+touch {job_done}
+""".format(**locals())
+
+ with open(script_fn,'w') as script_file:
+ script_file.write(script)
+ self.generated_script_fn = script_fn
+ #job_data['sge_option'] = sge_quiver
+
+def rm(f):
+ system('rm -f {}'.format(f))
+def touch(f):
+ system('touch {}'.format(f))
+def task_cns_zcat(self):
+ gathered_p_ctg_fn = fn(self.gathered_p_ctg)
+ gathered_h_ctg_fn = fn(self.gathered_h_ctg)
+
+ cns_p_ctg_fasta_fn = fn(self.cns_p_ctg_fasta)
+ cns_p_ctg_fastq_fn = fn(self.cns_p_ctg_fastq)
+ cns_h_ctg_fasta_fn = fn(self.cns_h_ctg_fasta)
+ cns_h_ctg_fastq_fn = fn(self.cns_h_ctg_fastq)
+ job_done_fn = fn(self.job_done)
+
+ rm(cns_p_ctg_fasta_fn)
+ touch(cns_p_ctg_fasta_fn)
+ rm(cns_p_ctg_fastq_fn)
+ touch(cns_p_ctg_fastq_fn)
+ with open(gathered_p_ctg_fn) as ifs:
+ for line in ifs:
+ cns_fasta_fn, cns_fastq_fn = line.split()
+ system('zcat {cns_fasta_fn} >> {cns_p_ctg_fasta_fn}'.format(**locals()))
+ system('zcat {cns_fastq_fn} >> {cns_p_ctg_fastq_fn}'.format(**locals()))
+ with open(gathered_p_ctg_fn) as ifs:
+ for line in ifs:
+ cns_fasta_fn, cns_fastq_fn = line.split()
+            rm(cns_fasta_fn)
+            rm(cns_fastq_fn)
+ rm(cns_h_ctg_fasta_fn)
+ touch(cns_h_ctg_fasta_fn)
+ rm(cns_h_ctg_fastq_fn)
+ touch(cns_h_ctg_fastq_fn)
+ with open(gathered_h_ctg_fn) as ifs:
+ for line in ifs:
+ cns_fasta_fn, cns_fastq_fn = line.split()
+ system('zcat {cns_fasta_fn} >> {cns_h_ctg_fasta_fn}'.format(**locals()))
+ system('zcat {cns_fastq_fn} >> {cns_h_ctg_fastq_fn}'.format(**locals()))
+ with open(gathered_h_ctg_fn) as ifs:
+ for line in ifs:
+ cns_fasta_fn, cns_fastq_fn = line.split()
+            rm(cns_fasta_fn)
+            rm(cns_fastq_fn)
+
+ touch(job_done_fn)
+
+def task_scatter_quiver(self):
+ p_ctg_fn = fn(self.p_ctg_fa)
+ h_ctg_fn = fn(self.h_ctg_fa)
+ out_json = fn(self.scattered_quiver_json)
+
+ ref_seq_data = {}
+
+ # I think this will crash if the file is empty. Maybe that is ok.
+ p_ctg_fa = FastaReader(p_ctg_fn)
+ ctg_types = {}
+ for r in p_ctg_fa:
+ rid = r.name.split()[0]
+ ref_seq_data[rid] = r.sequence
+ ctg_types[rid] = 'p'
+
+
+ # I think this will crash if the file is empty. Maybe that is ok.
+ h_ctg_fa = FastaReader(h_ctg_fn)
+ for r in h_ctg_fa:
+ rid = r.name.split()[0]
+ ref_seq_data[rid] = r.sequence
+ ctg_types[rid] = 'h'
+
+ ctg_ids = sorted(ref_seq_data.keys())
+ #p_ctg_out=[]
+ #h_ctg_out=[]
+ #job_done_plfs = {}
+ jobs = []
+ for ctg_id in ctg_ids:
+ sequence = ref_seq_data[ctg_id]
+ m_ctg_id = ctg_id.split('-')[0]
+ wd = os.path.join(os.getcwd(), './4-quiver/', m_ctg_id)
+ ref_fasta = os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id = ctg_id))
+ read_sam = os.path.join(os.getcwd(), './4-quiver/reads/' '{ctg_id}.sam'.format(ctg_id = ctg_id))
+ #cns_fasta = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fasta.gz'.format(ctg_id = ctg_id)))
+ #cns_fastq = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fastq.gz'.format(ctg_id = ctg_id)))
+ #job_done = makePypeLocalFile(os.path.join(wd, '{ctg_id}_quiver_done'.format(ctg_id = ctg_id)))
+
+ if os.path.exists(read_sam): # TODO(CD): Ask Jason what we should do if missing SAM. And what about network latency?
+ #if ctg_types[ctg_id] == 'p':
+ # p_ctg_out.append( (cns_fasta, cns_fastq) )
+ #if ctg_types[ctg_id] == 'h':
+ # h_ctg_out.append( (cns_fasta, cns_fastq) )
+ mkdir(wd)
+ if not os.path.exists(fn(ref_fasta)):
+ with open(fn(ref_fasta),'w') as f:
+ print >>f, '>'+ctg_id
+ print >>f, sequence
+ #parameters = {'job_uid':'q-'+ctg_id, 'wd': wd, 'config':config, 'ctg_id': ctg_id }
+ #make_quiver_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'read_sam': read_sam},
+ # outputs = {'cns_fasta': cns_fasta, 'cns_fastq': cns_fastq, 'job_done': job_done},
+ # parameters = parameters,
+ # )
+ #quiver_task = make_quiver_task(task_run_quiver)
+ #wf.addTask(quiver_task)
+ #job_done_plfs['{}'.format(ctg_id)] = job_done
+        new_job = {}
+        new_job['ctg_id'] = ctg_id
+        new_job['ctg_type'] = ctg_types[ctg_id]
+        jobs.append(new_job)
+ open(out_json, 'w').write(json.dumps(jobs))
+
+def create_quiver_jobs(wf, scattered_quiver_plf, config):
+ scattered_quiver_fn = fn(scattered_quiver_plf)
+ jobs = json.loads(open(scattered_quiver_fn).read())
+ #ctg_ids = sorted(jobs['ref_seq_data'])
+ p_ctg_out=[]
+ h_ctg_out=[]
+ job_done_plfs = {}
+ for job in jobs:
+ ctg_id = job['ctg_id']
+ m_ctg_id = ctg_id.split('-')[0]
+ wd = os.path.join(os.getcwd(), './4-quiver/', m_ctg_id)
+ ref_fasta = makePypeLocalFile(os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id = ctg_id)))
+ read_sam = makePypeLocalFile(os.path.join(os.getcwd(), './4-quiver/reads/' '{ctg_id}.sam'.format(ctg_id = ctg_id)))
+ cns_fasta = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fasta.gz'.format(ctg_id = ctg_id)))
+ cns_fastq = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fastq.gz'.format(ctg_id = ctg_id)))
+ job_done = makePypeLocalFile(os.path.join(wd, '{ctg_id}_quiver_done'.format(ctg_id = ctg_id)))
+
+ if os.path.exists(fn(read_sam)): # TODO(CD): Ask Jason what we should do if missing SAM.
+            if job['ctg_type'] == 'p':
+                p_ctg_out.append( (fn(cns_fasta), fn(cns_fastq)) )
+            elif job['ctg_type'] == 'h':
+                h_ctg_out.append( (fn(cns_fasta), fn(cns_fastq)) )
+            else:
+                LOG.warning('Type is {!r}, not "p" or "h". Why are we running Quiver?'.format(job['ctg_type']))
+ parameters = {'job_uid':'q-'+ctg_id, 'wd': wd, 'config':config, 'ctg_id': ctg_id }
+ make_quiver_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'read_sam': read_sam,
+ 'scattered_quiver': scattered_quiver_plf,
+ },
+ outputs = {'cns_fasta': cns_fasta, 'cns_fastq': cns_fastq, 'job_done': job_done},
+ parameters = parameters,
+ )
+ quiver_task = make_quiver_task(task_run_quiver)
+ wf.addTask(quiver_task)
+ job_done_plfs['{}'.format(ctg_id)] = job_done
+ #sge_quiver = config['sge_quiver']
+ return p_ctg_out, h_ctg_out, job_done_plfs
+
+def task_gather_quiver(self):
+ """We wrote the "gathered" files during task construction.
+ """
+ job_done_fn = fn(self.job_done)
+ touch(job_done_fn)
+
+
+def main(argv=sys.argv):
+ global LOG
+ LOG = support.setup_logger(None)
+
+
+    if len(argv) < 2:
+        print>>sys.stderr, 'you need to provide a configuration file to specify the cluster running environment'
+        sys.exit(1)
+
+    config_fn = argv[1]
+ config_absbasedir = os.path.dirname(os.path.abspath(config_fn))
+
+ config = ConfigParser.ConfigParser()
+ config.read(config_fn)
+
+
+ job_type = 'SGE'
+ if config.has_option('General', 'job_type'):
+ job_type = config.get('General', 'job_type')
+
+ sge_track_reads = ' -pe smp 12 -q bigmem'
+ if config.has_option('Unzip', 'sge_track_reads'):
+ sge_track_reads = config.get('Unzip', 'sge_track_reads')
+
+ sge_quiver = ' -pe smp 24 -q bigmem '
+ if config.has_option('Unzip', 'sge_quiver'):
+ sge_quiver = config.get('Unzip', 'sge_quiver')
+
+ smrt_bin = '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/'
+ if config.has_option('Unzip', 'smrt_bin'):
+ smrt_bin = config.get('Unzip', 'smrt_bin')
+
+ input_bam_fofn = 'input_bam.fofn'
+ if config.has_option('Unzip', 'input_bam_fofn'):
+ input_bam_fofn = config.get('Unzip', 'input_bam_fofn')
+ if not os.path.isabs(input_bam_fofn):
+ input_bam_fofn = os.path.join(config_absbasedir, input_bam_fofn)
+
+
+ quiver_concurrent_jobs = 8
+ if config.has_option('Unzip', 'quiver_concurrent_jobs'):
+ quiver_concurrent_jobs = config.getint('Unzip', 'quiver_concurrent_jobs')
+
+ config = {'job_type': job_type,
+ 'sge_quiver': sge_quiver,
+ 'sge_track_reads': sge_track_reads,
+ 'input_bam_fofn': input_bam_fofn,
+ 'smrt_bin': smrt_bin}
+ LOG.info('config={}'.format(pprint.pformat(config)))
+
+ #support.job_type = 'SGE' #tmp hack until we have a configuration parser
+
+
+ wf = PypeProcWatcherWorkflow(
+ max_jobs=quiver_concurrent_jobs,
+ )
+
+ abscwd = os.path.abspath('.')
+ parameters = {'wd': os.path.join(abscwd, '4-quiver', 'track_reads_h'), 'config': config}
+ hasm_done_plf = makePypeLocalFile('./3-unzip/1-hasm/hasm_done') # by convention
+ track_reads_h_done_plf = makePypeLocalFile(os.path.join(parameters['wd'], 'track_reads_h_done'))
+ make_track_reads_task = PypeTask(inputs = {'hasm_done': hasm_done_plf},
+ outputs = {'job_done': track_reads_h_done_plf},
+ parameters = parameters,
+ )
+ track_reads_task = make_track_reads_task(task_track_reads)
+ #sge_track_reads = config['sge_track_reads']
+
+ wf.addTask(track_reads_task)
+
+ scattered_quiver_plf = makePypeLocalFile('4-quiver/quiver_scatter/scattered.json')
+ make_task = PypeTask(
+ inputs = {
+ 'p_ctg_fa': makePypeLocalFile('3-unzip/all_p_ctg.fa'),
+ 'h_ctg_fa': makePypeLocalFile('3-unzip/all_h_ctg.fa'),
+ 'track_reads_h_done': track_reads_h_done_plf,
+ },
+ outputs = {
+ 'scattered_quiver_json': scattered_quiver_plf,
+ },
+ parameters = {},
+ )
+ wf.addTask(make_task(task_scatter_quiver))
+ wf.refreshTargets()
+
+    p_ctg_out, h_ctg_out, job_done_plfs = create_quiver_jobs(wf, scattered_quiver_plf, config)
+
+ gathered_p_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/p_ctg.txt')
+ gathered_h_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/h_ctg.txt')
+ gather_done_plf = makePypeLocalFile('4-quiver/cns_gather/job_done')
+ mkdir('4-quiver/cns_gather')
+ with open(fn(gathered_p_ctg_plf), 'w') as ifs:
+ for cns_fasta_fn, cns_fastq_fn in sorted(p_ctg_out):
+ ifs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))
+ with open(fn(gathered_h_ctg_plf), 'w') as ifs:
+ for cns_fasta_fn, cns_fastq_fn in sorted(h_ctg_out):
+ ifs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))
+
+ make_task = PypeTask(
+ inputs = job_done_plfs,
+ outputs = {
+ 'job_done': gather_done_plf,
+ },
+ parameters = {},
+ )
+ wf.addTask(make_task(task_gather_quiver))
+ wf.refreshTargets()
+
+ cns_p_ctg_fasta_plf = makePypeLocalFile('4-quiver/cns_output/cns_p_ctg.fasta')
+ cns_p_ctg_fastq_plf = makePypeLocalFile('4-quiver/cns_output/cns_p_ctg.fastq')
+ cns_h_ctg_fasta_plf = makePypeLocalFile('4-quiver/cns_output/cns_h_ctg.fasta')
+ cns_h_ctg_fastq_plf = makePypeLocalFile('4-quiver/cns_output/cns_h_ctg.fastq')
+ zcat_done_plf = makePypeLocalFile('4-quiver/cns_output/job_done')
+ make_task = PypeTask(
+ inputs = {
+ 'gathered_p_ctg': gathered_p_ctg_plf,
+ 'gathered_h_ctg': gathered_h_ctg_plf,
+ 'gather_done': gather_done_plf,
+ },
+ outputs = {
+ 'cns_p_ctg_fasta': cns_p_ctg_fasta_plf,
+ 'cns_p_ctg_fastq': cns_p_ctg_fastq_plf,
+ 'cns_h_ctg_fasta': cns_h_ctg_fasta_plf,
+ 'cns_h_ctg_fastq': cns_h_ctg_fastq_plf,
+ 'job_done': zcat_done_plf,
+ },
+ )
+ wf.addTask(make_task(task_cns_zcat))
+
+ wf.refreshTargets()
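The scatter/gather handoff above is a plain JSON list of per-contig job records, written by task_scatter_quiver() and re-read by create_quiver_jobs(). A minimal sketch of that round trip (the file name and field values are illustrative only):

    import json

    # write one record per contig, as task_scatter_quiver() does
    jobs = [{'ctg_id': '000000F', 'ctg_type': 'p'},
            {'ctg_id': '000000F_001', 'ctg_type': 'h'}]
    open('scattered.json', 'w').write(json.dumps(jobs))

    # read the records back later, as create_quiver_jobs() does
    for job in json.loads(open('scattered.json').read()):
        print job['ctg_id'], job['ctg_type']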
diff --git a/FALCON_unzip/falcon_unzip/select_reads_from_bam.py b/FALCON_unzip/falcon_unzip/select_reads_from_bam.py
new file mode 100644
index 0000000..03cbde1
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/select_reads_from_bam.py
@@ -0,0 +1,113 @@
+import pysam
+
+import argparse
+import glob
+import os
+import sys
+
+def select_reads_from_bam(input_bam_fofn_fn, asm_dir, hasm_dir, quiver_dir):
+ read_partition = {}
+ read_to_ctgs = {}
+
+ rawread_to_contigs_fn = os.path.join(quiver_dir, 'read_maps', 'rawread_to_contigs')
+ rawread_ids_fn = os.path.join(asm_dir, 'read_maps', 'dump_rawread_ids', 'rawread_ids')
+ rid_to_oid = open(rawread_ids_fn).read().split('\n')
+ with open(rawread_to_contigs_fn) as f:
+ for row in f:
+ row = row.strip().split()
+            if int(row[3]) >= 1: # keep only the top (rank-0) hit
+ continue
+ ctg_id = row[1]
+ if ctg_id == 'NA':
+ continue
+ read_partition.setdefault( ctg_id, set())
+ r_id = row[0]
+ o_id = rid_to_oid[ int(r_id) ]
+ read_partition[ ctg_id ].add(o_id)
+ read_to_ctgs.setdefault(o_id, [])
+ read_to_ctgs[ o_id ].append( (int(row[4]) ,ctg_id) )
+ print "num read_partitions:", len(read_partition)
+ print "num read_to_ctgs:", len(read_to_ctgs)
+
+ header = None
+ fofn_basedir = os.path.normpath(os.path.dirname(input_bam_fofn_fn))
+ def abs_fn(maybe_rel_fn):
+ if os.path.isabs(maybe_rel_fn):
+ return maybe_rel_fn
+ else:
+ return os.path.join(fofn_basedir, maybe_rel_fn)
+ for row in open(input_bam_fofn_fn):
+ fn = abs_fn(row.strip())
+ samfile = pysam.AlignmentFile(fn, 'rb', check_sq = False)
+ if header == None:
+ header = samfile.header
+ else:
+ header['RG'].extend(samfile.header['RG'])
+ samfile.close()
+
+    PG = header.pop('PG') # remove PG lines, as there may be a bug that generates unreadable characters
+ #print PG
+
+ #base_dir = os.getcwd()
+ #outfile = pysam.AlignmentFile( os.path.join(base_dir, 'header.sam' ), 'wh', header=header )
+ #outfile.close()
+
+ ctgs = read_partition.keys()
+ ctgs.sort()
+ selected_ctgs = set()
+ for ctg in ctgs:
+ picked_reads = read_partition[ ctg ]
+ print "ctg, len:", ctg, len(picked_reads) # TODO: Is this for debugging?
+ if len(picked_reads) > 20:
+ selected_ctgs.add(ctg)
+
+ outfile = {}
+
+ sam_dir = os.path.join(quiver_dir, 'reads')
+ mkdir(sam_dir)
+ for row in open(input_bam_fofn_fn):
+ fn = abs_fn(row.strip())
+ samfile = pysam.AlignmentFile(fn, 'rb', check_sq = False)
+ for r in samfile.fetch( until_eof = True ):
+ if r.query_name not in read_to_ctgs:
+ #print "Missing:", r.query_name
+ continue
+ ctg_list = read_to_ctgs[ r.query_name ]
+ ctg_list.sort()
+ score, ctg = ctg_list[0]
+ if ctg not in selected_ctgs:
+ #print "Not selected:", ctg
+ continue
+ if ctg not in outfile:
+ samfile_fn = os.path.join(quiver_dir, 'reads', '%s.sam' % ctg)
+ print >>sys.stderr, 'samfile_fn:{!r}'.format(samfile_fn)
+ outfile[ctg] = pysam.AlignmentFile(samfile_fn, 'wh', header=header)
+ outfile[ctg].write(r)
+ samfile.close()
+
+ for ctg in outfile:
+ outfile[ctg].close()
+
+def mkdir(d):
+ if not os.path.isdir(d):
+ os.makedirs(d)
+
+def parse_args(argv):
+    parser = argparse.ArgumentParser(description='Assumes 2-asm-falcon/read_maps/dump_rawread_ids/rawread_ids exists.',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('--basedir', type=str, default='./', help='the base working dir of a FALCON assembly')
+ parser.add_argument('input_bam_fofn', type=str, help='File of BAM filenames. Paths are relative to dir of FOFN, not CWD.')
+ args = parser.parse_args(argv[1:])
+ return args
+
+def main(argv=sys.argv):
+ args = parse_args(argv)
+ basedir = args.basedir
+ input_bam_fofn = args.input_bam_fofn
+ #rawread_dir = os.path.abspath(os.path.join(basedir, '0-rawreads'))
+ #pread_dir = os.path.abspath(os.path.join(basedir, '1-preads_ovl'))
+ asm_dir = os.path.abspath(os.path.join(basedir, '2-asm-falcon'))
+ hasm_dir = os.path.abspath(os.path.join(basedir, '3-unzip'))
+ quiver_dir = os.path.abspath(os.path.join(basedir, '4-quiver'))
+
+ select_reads_from_bam(input_bam_fofn, asm_dir=asm_dir, hasm_dir=hasm_dir, quiver_dir=quiver_dir)
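For orientation, each record in 4-quiver/read_maps/rawread_to_contigs (written by fc_rr_hctg_track above) carries six whitespace-separated fields (raw-read id, contig, hit count, rank, score, in-contig flag), and the reader above keeps only rank-0, non-'NA' assignments. A small parsing sketch, where the sample line is made up:

    # sketch of the rank-0 filter used above; the line is a fabricated example
    line = '000001234 000000F 12 0 -45678 0'
    r_id, ctg_id, count, rank, score, in_ctg = line.strip().split()
    if int(rank) == 0 and ctg_id != 'NA':
        print r_id, '->', ctg_id   # this read goes to its best contig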
diff --git a/FALCON_unzip/falcon_unzip/unzip.py b/FALCON_unzip/falcon_unzip/unzip.py
new file mode 100644
index 0000000..e585fbc
--- /dev/null
+++ b/FALCON_unzip/falcon_unzip/unzip.py
@@ -0,0 +1,357 @@
+from falcon_kit import run_support as support
+from pypeflow.simple_pwatcher_bridge import (
+ PypeLocalFile, makePypeLocalFile, fn,
+ PypeTask,
+ PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase)
+from falcon_kit.FastaReader import FastaReader
+import glob
+import logging
+import os
+import re
+import sys
+import time
+import ConfigParser
+
+LOG = logging.getLogger(__name__)
+
+def system(call, check=False):
+ LOG.debug('$(%s)' %repr(call))
+ rc = os.system(call)
+ msg = 'Call %r returned %d.' % (call, rc)
+ if rc:
+ LOG.warning(msg)
+ if check:
+ raise Exception(msg)
+ else:
+ LOG.debug(msg)
+ return rc
+
+def mkdir(d):
+ if not os.path.isdir(d):
+ os.makedirs(d)
+
+def task_track_reads(self):
+ job_done = fn(self.job_done)
+ wd = self.parameters['wd']
+ config = self.parameters['config']
+ sge_track_reads = config['sge_track_reads']
+ script_dir = os.path.join(wd)
+ script_fn = os.path.join(script_dir , 'track_reads.sh')
+ topdir = '../..'
+
+ script = """\
+set -vex
+trap 'touch {job_done}.exit' EXIT
+hostname
+date
+cd {topdir}
+python -m falcon_kit.mains.get_read_ctg_map
+python -m falcon_kit.mains.rr_ctg_track
+python -m falcon_kit.mains.pr_ctg_track
+#mkdir -p 3-unzip/reads/
+python -m falcon_kit.mains.fetch_reads
+cd {wd}
+date
+touch {job_done}
+""".format(**locals())
+
+ with open(script_fn,'w') as script_file:
+ script_file.write(script)
+ self.generated_script_fn = script_fn
+ #job_data['sge_option'] = sge_track_reads
+
+
+def task_run_blasr(self):
+ job_done = fn(self.job_done)
+ ref_fasta = fn(self.ref_fasta)
+ read_fasta = fn(self.read_fasta)
+
+ job_uid = self.parameters['job_uid']
+ wd = self.parameters['wd']
+ ctg_id = self.parameters['ctg_id']
+
+ config = self.parameters['config']
+ smrt_bin = config['smrt_bin']
+ sge_blasr_aln = config['sge_blasr_aln']
+ job_type = config['job_type']
+ blasr = os.path.join(smrt_bin, 'blasr')
+ samtools = os.path.join(smrt_bin, 'samtools')
+
+
+ script_dir = os.path.join(wd)
+ script_fn = os.path.join(script_dir , 'aln_{ctg_id}.sh'.format(ctg_id = ctg_id))
+
+ script = """\
+set -vex
+trap 'touch {job_done}.exit' EXIT
+cd {wd}
+hostname
+date
+cd {wd}
+time {blasr} {read_fasta} {ref_fasta} -noSplitSubreads -clipping subread\
+ -hitPolicy randombest -randomSeed 42 -bestn 1 -minPctIdentity 70.0\
+ -minMatch 12 -nproc 24 -sam -out tmp_aln.sam
+{samtools} view -bS tmp_aln.sam | {samtools} sort - {ctg_id}_sorted
+{samtools} index {ctg_id}_sorted.bam
+rm tmp_aln.sam
+date
+touch {job_done}
+""".format(**locals())
+
+ with open(script_fn,'w') as script_file:
+ script_file.write(script)
+ self.generated_script_fn = script_fn
+ #job_data['sge_option'] = sge_blasr_aln
+
+
+def task_phasing(self):
+ ref_fasta = fn(self.ref_fasta)
+ aln_bam = fn(self.aln_bam)
+
+ job_done = fn(self.job_done)
+
+ job_uid = self.parameters['job_uid']
+ wd = self.parameters['wd']
+ ctg_id = self.parameters['ctg_id']
+
+ config = self.parameters['config']
+ sge_phasing = config['sge_phasing']
+ job_type = config['job_type']
+
+ script_dir = os.path.join(wd)
+ script_fn = os.path.join(script_dir , 'p_%s.sh' % (ctg_id))
+
+ script = """\
+set -vex
+trap 'touch {job_done}.exit' EXIT
+cd {wd}
+hostname
+date
+cd {wd}
+fc_phasing.py --bam {aln_bam} --fasta {ref_fasta} --ctg_id {ctg_id} --base_dir ..
+fc_phasing_readmap.py --ctg_id {ctg_id} --read_map_dir ../../../2-asm-falcon/read_maps --phased_reads phased_reads
+date
+touch {job_done}
+""".format(**locals())
+
+ with open(script_fn,'w') as script_file:
+ script_file.write(script)
+ self.generated_script_fn = script_fn
+ #job_data['sge_option'] = sge_phasing
+
+
+def task_hasm(self):
+ rid_to_phase_all = fn(self.rid_to_phase_all)
+ job_done = fn(self.job_done)
+ config = self.parameters['config']
+ sge_hasm = config['sge_hasm']
+
+ wd = self.parameters['wd']
+
+ job_type = config['job_type']
+
+ script_dir = os.path.join(wd)
+ script_fn = os.path.join(script_dir , 'hasm.sh')
+
+    #las_fofn = '../../2-asm-falcon/las.fofn'  # superseded by the path below
+    las_fofn = '../../1-preads_ovl/merge-gather/las.fofn'
+ script = """\
+set -vex
+trap 'touch {job_done}.exit' EXIT
+hostname
+date
+cd {wd}
+
+fc_ovlp_filter_with_phase.py --fofn {las_fofn} --max_diff 120 --max_cov 120 --min_cov 1 --n_core 12 --min_len 2500 --db ../../1-preads_ovl/preads.db --rid_phase_map {rid_to_phase_all} > preads.p_ovl
+fc_phased_ovlp_to_graph.py preads.p_ovl --min_len 2500 > fc.log
+if [ -e ../../1-preads_ovl/preads4falcon.fasta ];
+then
+ ln -sf ../../1-preads_ovl/preads4falcon.fasta .
+else
+ ln -sf ../../1-preads_ovl/db2falcon/preads4falcon.fasta .
+fi
+fc_graphs_to_h_tigs.py --fc_asm_path ../../2-asm-falcon/ --fc_hasm_path ./ --ctg_id all --rid_phase_map {rid_to_phase_all} --fasta preads4falcon.fasta
+
+# more script steps; a bit hacky here, we should improve this
+
+WD=$PWD
+for f in `cat ../reads/ctg_list `; do mkdir -p $WD/$f; cd $WD/$f; fc_dedup_h_tigs.py $f; done
+
+## prepare for quivering the haplotigs
+cd $WD/..
+if [ -e "all_phased_reads" ]; then rm all_phased_reads; fi
+if [ -e "all_h_ctg_ids" ]; then rm all_h_ctg_ids; fi
+if [ -e "all_p_ctg_edges" ]; then rm all_p_ctg_edges; fi
+if [ -e "all_p_ctg.fa" ]; then rm all_p_ctg.fa; fi
+if [ -e "all_h_ctg.fa" ]; then rm all_h_ctg.fa; fi
+
+find 0-phasing -name "phased_reads" | sort | xargs cat >> all_phased_reads
+find 1-hasm -name "h_ctg_ids.*" | sort | xargs cat >> all_h_ctg_ids
+find 1-hasm -name "p_ctg_edges.*" | sort | xargs cat >> all_p_ctg_edges
+find 1-hasm -name "h_ctg_edges.*" | sort | xargs cat >> all_h_ctg_edges
+find 1-hasm -name "p_ctg.*.fa" | sort | xargs cat >> all_p_ctg.fa
+find 1-hasm -name "h_ctg.*.fa" | sort | xargs cat >> all_h_ctg.fa
+cd ../
+date
+touch {job_done}
+""".format(**locals())
+
+ with open(script_fn,'w') as script_file:
+ script_file.write(script)
+ self.generated_script_fn = script_fn
+ #job_data['sge_option'] = sge_hasm
+
+def unzip_all(config):
+ unzip_concurrent_jobs = config['unzip_concurrent_jobs']
+ wf = PypeProcWatcherWorkflow(
+ max_jobs=unzip_concurrent_jobs,
+ )
+ wf.max_jobs = unzip_concurrent_jobs
+
+ ctg_list_file = makePypeLocalFile('./3-unzip/reads/ctg_list')
+ falcon_asm_done = makePypeLocalFile('./2-asm-falcon/falcon_asm_done')
+ wdir = os.path.abspath('./3-unzip/reads')
+ parameters = {'wd': wdir, 'config': config}
+
+ job_done = makePypeLocalFile(os.path.join(parameters['wd'], 'track_reads_done'))
+ make_track_reads_task = PypeTask(inputs = {'falcon_asm_done': falcon_asm_done},
+ outputs = {'job_done': job_done, 'ctg_list_file': ctg_list_file},
+ parameters = parameters,
+ wdir = wdir,
+ )
+ track_reads_task = make_track_reads_task(task_track_reads)
+
+ wf.addTask(track_reads_task)
+ wf.refreshTargets() #force refresh now, will put proper dependence later
+
+ ctg_ids = []
+ with open('./3-unzip/reads/ctg_list') as f:
+ for row in f:
+ row = row.strip()
+ ctg_ids.append(row)
+
+ aln1_outs = {}
+
+ all_ctg_out = {}
+
+ for ctg_id in ctg_ids:
+ # inputs
+ ref_fasta = makePypeLocalFile('./3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id = ctg_id))
+ read_fasta = makePypeLocalFile('./3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id = ctg_id))
+
+ # outputs
+ wd = os.path.join(os.getcwd(), './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id = ctg_id))
+ #mkdir(wd)
+ blasr_dir = os.path.join(wd, 'blasr')
+ ctg_aln_out = makePypeLocalFile(os.path.join(blasr_dir, '{ctg_id}_sorted.bam'.format(ctg_id = ctg_id)))
+ job_done = makePypeLocalFile(os.path.join(blasr_dir, 'aln_{ctg_id}_done'.format(ctg_id = ctg_id)))
+
+ parameters = {'job_uid':'aln-'+ctg_id, 'wd': blasr_dir, 'config':config, 'ctg_id': ctg_id}
+ make_blasr_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'read_fasta': read_fasta},
+ outputs = {'ctg_aln_out': ctg_aln_out, 'job_done': job_done},
+ parameters = parameters,
+ )
+ blasr_task = make_blasr_task(task_run_blasr)
+ aln1_outs[ctg_id] = (ctg_aln_out, job_done)
+ wf.addTask(blasr_task)
+
+ phasing_dir = os.path.join(wd, 'phasing')
+ job_done = makePypeLocalFile(os.path.join(phasing_dir, 'p_{ctg_id}_done'.format(ctg_id = ctg_id)))
+ rid_to_phase_out = makePypeLocalFile(os.path.join(wd, 'rid_to_phase.{ctg_id}'.format(ctg_id = ctg_id))) # TODO: ???
+ all_ctg_out[ 'r2p.{ctg_id}'.format(ctg_id = ctg_id) ] = rid_to_phase_out # implicit output?
+
+ parameters = {'job_uid':'ha-'+ctg_id, 'wd': wd, 'config':config, 'ctg_id': ctg_id}
+ make_phasing_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'aln_bam':ctg_aln_out},
+ outputs = {'job_done': job_done},
+ parameters = parameters,
+ )
+ phasing_task = make_phasing_task(task_phasing)
+ wf.addTask(phasing_task)
+
+ wf.refreshTargets()
+
+ hasm_wd = os.path.abspath('./3-unzip/1-hasm/')
+ #mkdir(hasm_wd)
+ rid_to_phase_all = makePypeLocalFile(os.path.join(hasm_wd, 'rid-to-phase-all', 'rid_to_phase.all'))
+ task = PypeTask(inputs = all_ctg_out, outputs = {'rid_to_phase_all': rid_to_phase_all},
+ ) (get_rid_to_phase_all)
+ wf.addTask(task)
+
+ parameters['wd'] = hasm_wd
+ job_done = makePypeLocalFile(os.path.join(hasm_wd, 'hasm_done'))
+ make_hasm_task = PypeTask(inputs = {'rid_to_phase_all': rid_to_phase_all},
+ outputs = {'job_done': job_done},
+ parameters = parameters,
+ )
+ hasm_task = make_hasm_task(task_hasm)
+
+ wf.addTask(hasm_task)
+
+ wf.refreshTargets()
+
+def get_rid_to_phase_all(self):
+ # Tasks must be at module scope now.
+ rid_to_phase_all_fn = fn(self.rid_to_phase_all)
+ inputs_fn = [ fn(f) for f in self.inputs.values() ]
+ inputs_fn.sort()
+ output = []
+ for fname in inputs_fn:
+        output.append(open(fname).read())
+
+ out = open(rid_to_phase_all_fn, 'w')
+ out.write(''.join(output))
+ out.close()
+
+def main(argv=sys.argv):
+ global LOG
+ LOG = support.setup_logger(None)
+
+
+ if len(argv) < 2 or argv[1].startswith('-'):
+        print 'you need to provide a configuration file to specify the cluster running environment'
+ sys.exit(1)
+
+ config_fn = argv[1]
+
+ config = ConfigParser.ConfigParser()
+ config.read(config_fn)
+
+ job_type = 'SGE'
+ if config.has_option('General', 'job_type'):
+ job_type = config.get('General', 'job_type')
+
+ sge_blasr_aln = ' -pe smp 24 -q bigmem '
+ if config.has_option('Unzip', 'sge_blasr_aln'):
+ sge_blasr_aln = config.get('Unzip', 'sge_blasr_aln')
+
+ smrt_bin = '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/'
+ if config.has_option('Unzip', 'smrt_bin'):
+ smrt_bin = config.get('Unzip', 'smrt_bin')
+
+ sge_phasing = ' -pe smp 12 -q bigmem'
+ if config.has_option('Unzip', 'sge_phasing'):
+ sge_phasing = config.get('Unzip', 'sge_phasing')
+
+ sge_hasm = ' -pe smp 48 -q bigmem'
+ if config.has_option('Unzip', 'sge_hasm'):
+ sge_hasm = config.get('Unzip', 'sge_hasm')
+
+ sge_track_reads = ' -pe smp 12 -q bigmem'
+ if config.has_option('Unzip', 'sge_track_reads'):
+ sge_track_reads = config.get('Unzip', 'sge_track_reads')
+
+ unzip_concurrent_jobs = 8
+ if config.has_option('Unzip', 'unzip_concurrent_jobs'):
+ unzip_concurrent_jobs = config.getint('Unzip', 'unzip_concurrent_jobs')
+
+ config = {'job_type': job_type,
+ 'sge_blasr_aln': sge_blasr_aln,
+ 'smrt_bin': smrt_bin,
+ 'sge_phasing': sge_phasing,
+ 'sge_hasm': sge_hasm,
+ 'sge_track_reads': sge_track_reads,
+ 'unzip_concurrent_jobs':unzip_concurrent_jobs}
+
+ #support.job_type = 'SGE' #tmp hack until we have a configuration parser
+
+ unzip_all(config)
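Both main() functions above repeat the same ConfigParser pattern: read an option from the [Unzip] (or [General]) section if it is present, otherwise fall back to a hard-coded default. A small helper capturing that pattern (the helper name and the config file name are illustrative, not part of the module):

    import ConfigParser  # 'configparser' on Python 3

    def get_option(config, section, option, default):
        # return the configured value if present, else the default
        if config.has_option(section, option):
            return config.get(section, option)
        return default

    config = ConfigParser.ConfigParser()
    config.read('fc_unzip.cfg')
    job_type = get_option(config, 'General', 'job_type', 'SGE')
    smrt_bin = get_option(config, 'Unzip', 'smrt_bin', '/opt/smrtcmds/bin/')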
diff --git a/FALCON_unzip/setup.py b/FALCON_unzip/setup.py
new file mode 100755
index 0000000..b54466c
--- /dev/null
+++ b/FALCON_unzip/setup.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+
+from setuptools import setup
+
+from distutils.core import Extension
+
+import glob
+
+install_requires=[ "networkx >= 1.7", "pysam >= 0.8.4" ]
+
+scripts = glob.glob("src/py_scripts/*.py")
+
+setup(name='falcon_unzip',
+ version='0.1.0',
+ description='Falcon unzip',
+ author='Jason Chin',
+      author_email='jchin@pacificbiosciences.com',
+ packages=['falcon_unzip'],
+ package_dir={'falcon_unzip':'falcon_unzip/'},
+ scripts = scripts,
+ zip_safe = False,
+ install_requires=install_requires
+ )
+
diff --git a/FALCON_unzip/src/py_scripts/fc_dedup_h_tigs.py b/FALCON_unzip/src/py_scripts/fc_dedup_h_tigs.py
new file mode 100644
index 0000000..71b60f5
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_dedup_h_tigs.py
@@ -0,0 +1,4 @@
+from falcon_unzip.dedup_h_tigs import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_get_read_hctg_map.py b/FALCON_unzip/src/py_scripts/fc_get_read_hctg_map.py
new file mode 100644
index 0000000..09af64b
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_get_read_hctg_map.py
@@ -0,0 +1,4 @@
+from falcon_unzip.get_read_hctg_map import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_graphs_to_h_tigs.py b/FALCON_unzip/src/py_scripts/fc_graphs_to_h_tigs.py
new file mode 100644
index 0000000..e9fcb1f
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_graphs_to_h_tigs.py
@@ -0,0 +1,4 @@
+from falcon_unzip.graphs_to_h_tigs import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_ovlp_filter_with_phase.py b/FALCON_unzip/src/py_scripts/fc_ovlp_filter_with_phase.py
new file mode 100644
index 0000000..c3a38d2
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_ovlp_filter_with_phase.py
@@ -0,0 +1,6 @@
+from falcon_unzip.ovlp_filter_with_phase import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
+
+
diff --git a/FALCON_unzip/src/py_scripts/fc_phased_ovlp_to_graph.py b/FALCON_unzip/src/py_scripts/fc_phased_ovlp_to_graph.py
new file mode 100644
index 0000000..2c58c5a
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_phased_ovlp_to_graph.py
@@ -0,0 +1,5 @@
+from falcon_unzip.phased_ovlp_to_graph import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
+
diff --git a/FALCON_unzip/src/py_scripts/fc_phasing.py b/FALCON_unzip/src/py_scripts/fc_phasing.py
new file mode 100644
index 0000000..70c4711
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_phasing.py
@@ -0,0 +1,5 @@
+from falcon_unzip.phasing import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
+
diff --git a/FALCON_unzip/src/py_scripts/fc_phasing_readmap.py b/FALCON_unzip/src/py_scripts/fc_phasing_readmap.py
new file mode 100644
index 0000000..3fa3592
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_phasing_readmap.py
@@ -0,0 +1,4 @@
+from falcon_unzip.phasing_readmap import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_quiver.py b/FALCON_unzip/src/py_scripts/fc_quiver.py
new file mode 100644
index 0000000..21a5af5
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_quiver.py
@@ -0,0 +1,4 @@
+from falcon_unzip.run_quiver import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_rr_hctg_track.py b/FALCON_unzip/src/py_scripts/fc_rr_hctg_track.py
new file mode 100644
index 0000000..b68b43a
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_rr_hctg_track.py
@@ -0,0 +1,4 @@
+from falcon_unzip.rr_hctg_track import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_select_reads_from_bam.py b/FALCON_unzip/src/py_scripts/fc_select_reads_from_bam.py
new file mode 100644
index 0000000..0e85055
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_select_reads_from_bam.py
@@ -0,0 +1,4 @@
+from falcon_unzip.select_reads_from_bam import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
diff --git a/FALCON_unzip/src/py_scripts/fc_track_reads_htigs0.py b/FALCON_unzip/src/py_scripts/fc_track_reads_htigs0.py
new file mode 100644
index 0000000..f84dc30
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_track_reads_htigs0.py
@@ -0,0 +1,398 @@
+#from pypeflow.common import *
+#from pypeflow.data import PypeLocalFile, makePypeLocalFile, fn
+#from pypeflow.task import PypeTask, PypeThreadTaskBase, PypeTaskBase
+#from pypeflow.controller import PypeWorkflow, PypeMPWorkflow, PypeThreadWorkflow
+from pypeflow.simple_pwatcher_bridge import (PypeProcWatcherWorkflow, MyFakePypeThreadTaskBase,
+ makePypeLocalFile, fn, PypeTask)
+PypeThreadTaskBase = MyFakePypeThreadTaskBase
+from falcon_kit.FastaReader import FastaReader
+from falcon_kit.fc_asm_graph import AsmGraph
+import glob
+import sys
+import subprocess as sp
+import shlex
+import os
+
+def make_dirs(d):
+ if not os.path.isdir(d):
+ os.makedirs(d)
+
+rawread_dir = os.path.abspath( "./0-rawreads" )
+pread_dir = os.path.abspath( "./1-preads_ovl" )
+asm_dir = os.path.abspath( os.path.join("./3-unzip/") )
+
+read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
+make_dirs(read_map_dir)
+
+wf = PypeProcWatcherWorkflow(
+ max_jobs=12,
+)
+
+rawread_db = makePypeLocalFile( os.path.join( rawread_dir, "raw_reads.db" ) )
+rawread_id_file = makePypeLocalFile( os.path.join( rawread_dir, "raw_read_ids" ) )
+
+ at PypeTask( inputs = {"rawread_db": rawread_db},
+ outputs = {"rawread_id_file": rawread_id_file},
+ TaskType = PypeThreadTaskBase,
+ URL = "task://localhost/dump_rawread_ids" )
+def dump_rawread_ids(self):
+ rawread_db = fn( self.rawread_db )
+ rawread_id_file = fn( self.rawread_id_file )
+ os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (rawread_db, rawread_id_file) )
+
+wf.addTask( dump_rawread_ids )
+
+pread_db = makePypeLocalFile( os.path.join( pread_dir, "preads.db" ) )
+pread_id_file = makePypeLocalFile( os.path.join( pread_dir, "pread_ids" ) )
+
+ at PypeTask( inputs = {"pread_db": pread_db},
+ outputs = {"pread_id_file": pread_id_file},
+ TaskType = PypeThreadTaskBase,
+ URL = "task://localhost/dump_pread_ids" )
+def dump_pread_ids(self):
+ pread_db = fn( self.pread_db )
+ pread_id_file = fn( self.pread_id_file )
+ os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (pread_db, pread_id_file) )
+
+wf.addTask( dump_pread_ids )
+wf.refreshTargets() # block
+
+all_raw_las_files = {}
+for las_fn in glob.glob( os.path.join( rawread_dir, "m*/raw_reads.*.las") ):
+ idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number
+ idx = int(idx.split(".")[1])
+ las_file = makePypeLocalFile( las_fn )
+ all_raw_las_files["r_las_%s" % idx] = las_file
+
+all_pread_las_files = {}
+for las_fn in glob.glob( os.path.join( pread_dir, "m*/preads.*.las") ):
+ idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number
+ idx = int(idx.split(".")[1])
+ las_file = makePypeLocalFile( las_fn )
+ all_pread_las_files["p_las_%s" % idx] = las_file
+
+
+
+h_ctg_edges = makePypeLocalFile( os.path.join(asm_dir, "all_h_ctg_edges") )
+p_ctg_edges = makePypeLocalFile( os.path.join(asm_dir, "all_p_ctg_edges") )
+h_ctg_ids = makePypeLocalFile( os.path.join(asm_dir, "all_h_ctg_ids") )
+
+inputs = { "rawread_id_file": rawread_id_file,
+ "pread_id_file": pread_id_file,
+ "h_ctg_edges": h_ctg_edges,
+ "p_ctg_edges": p_ctg_edges,
+ "h_ctg_ids": h_ctg_ids}
+
+read_to_contig_map = makePypeLocalFile( os.path.join(read_map_dir, "read_to_contig_map") )
+
+@PypeTask( inputs = inputs,
+ outputs = {"read_to_contig_map": read_to_contig_map},
+ TaskType = PypeThreadTaskBase,
+ URL = "task://localhost/get_ctg_read_map" )
+
+def generate_read_to_ctg_map(self):
+ rawread_id_file = fn( self.rawread_id_file )
+ pread_id_file = fn( self.pread_id_file )
+ read_to_contig_map = fn( self.read_to_contig_map )
+
+ pread_did_to_rid = open(pread_id_file).read().split("\n")
+ rid_to_oid = open(rawread_id_file).read().split("\n")
+
+ h_ctg_edges = fn( self.h_ctg_edges )
+ p_ctg_edges = fn( self.p_ctg_edges )
+
+ h_ctg_ids = set()
+ with open(fn(self.h_ctg_ids)) as f:
+ for row in f:
+ row = row.strip()
+ h_ctg_ids.add( row )
+
+ pread_to_contigs = {}
+
+    for fname in ( p_ctg_edges, h_ctg_edges):
+        with open(fname) as f:
+ for row in f:
+ row = row.strip().split()
+ ctg = row[0]
+ if len(ctg.split("_")) > 1 and ctg not in h_ctg_ids:
+ continue
+ n1 = row[1]
+ n2 = row[2]
+ pid1 = int(n1.split(":")[0])
+ pid2 = int(n2.split(":")[0])
+ rid1 = pread_did_to_rid[pid1].split("/")[1]
+ rid2 = pread_did_to_rid[pid2].split("/")[1]
+ rid1 = int(int(rid1)/10)
+ rid2 = int(int(rid2)/10)
+ oid1 = rid_to_oid[rid1]
+ oid2 = rid_to_oid[rid2]
+ k1 = (pid1, rid1, oid1)
+ pread_to_contigs.setdefault( k1, set() )
+ pread_to_contigs[ k1 ].add( ctg )
+ k2 = (pid2, rid2, oid2)
+ pread_to_contigs.setdefault( k2, set() )
+ pread_to_contigs[ k2 ].add( ctg )
+
+ with open(read_to_contig_map, "w") as f:
+ for k in pread_to_contigs:
+ pid, rid, oid = k
+ for ctg in list(pread_to_contigs[ k ]):
+ print >>f, "%09d %09d %s %s" % (pid, rid, oid, ctg)
+
+wf.addTask( generate_read_to_ctg_map )
+
+def dump_rawread_to_ctg(self):
+ rawread_db = fn(self.rawread_db)
+ rawread_id_file = fn(self.rawread_id_file)
+ phased_read_file = fn(self.phased_reads)
+ las_file = fn(self.las_file)
+ rawread_to_contig_file = fn(self.rawread_to_contig_file)
+ read_to_contig_map = fn(self.read_to_contig_map)
+ rid_to_oid = open(rawread_id_file).read().split('\n')
+
+
+ ovlp_data = []
+ ovlp_count = 0
+ longest_ovlp = 0
+ a_id = None
+ rid_to_contigs = {}
+
+ with open(read_to_contig_map) as f:
+ for row in f:
+ row = row.strip().split()
+ pid, rid, oid, ctg = row
+ rid = int(rid)
+ rid_to_contigs.setdefault( rid, (oid, set() ) )
+ rid_to_contigs[ rid ][1].add( ctg )
+
+ oid_to_phase = {}
+ with open(phased_read_file) as f:
+ for row in f:
+ row = row.strip().split()
+ ctg_id, block, phase = row[1:4]
+ oid = row[6]
+ block = int(block)
+ phase = int(phase)
+ oid_to_phase[ oid ] = (ctg_id, block, phase)
+
+
+ with open(rawread_to_contig_file, "w") as f:
+ ovlp_data = {}
+ cur_read_id = None
+ skip_rest = 0
+ for row in sp.check_output(shlex.split("LA4Falcon -m %s %s " % (rawread_db, las_file)) ).splitlines():
+
+ row = row.strip().split()
+ t_id = int(row[1])
+ q_id = int(row[0])
+ if q_id != cur_read_id:
+ if cur_read_id == None:
+ cur_read_id = q_id
+ else:
+ if len(ovlp_data) == 0:
+ o_id = rid_to_oid[ cur_read_id ]
+ print >>f, "%09d %s %s %d %d %d %d" % (cur_read_id, o_id, "NA", 0, 0, 0, 0)
+ if len(ovlp_data) != 0:
+ ovlp_v = ovlp_data.values()
+ ovlp_v.sort()
+ rank = 0
+ for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
+ print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
+ rank += 1
+ ovlp_data = {}
+ cur_read_id = q_id
+ skip_rest = 0
+
+ if q_id in rid_to_contigs and len(ovlp_data) == 0: #if the query is already an edge of some contig....
+ t_o_id, ctgs = rid_to_contigs[ q_id ]
+ o_id = rid_to_oid[ q_id ]
+ for ctg in list(ctgs):
+ ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
+ ovlp_data[ctg][0] = -int(row[7])
+ ovlp_data[ctg][1] += 1
+ skip_rest = 1
+
+ if skip_rest == 1:
+ continue
+
+ if t_id not in rid_to_contigs:
+ continue
+
+ q_phase = oid_to_phase.get( rid_to_oid[q_id], None )
+ if q_phase != None:
+ ctg_id, block, phase = q_phase
+ if block != -1:
+ t_phase = oid_to_phase.get( rid_to_oid[t_id], None )
+ if t_phase != None:
+ if t_phase[0] == ctg_id and t_phase[1] == block and t_phase[2] != phase:
+ continue
+
+ t_o_id, ctgs = rid_to_contigs[ t_id ]
+ o_id = rid_to_oid[ q_id ]
+
+ for ctg in list(ctgs):
+ ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
+ ovlp_data[ctg][0] += int(row[2])
+ ovlp_data[ctg][1] += 1
+
+ if len(ovlp_data) != 0:
+ ovlp_v = ovlp_data.values()
+ ovlp_v.sort()
+ rank = 0
+ for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
+ print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
+ rank += 1
+
+def dump_pread_to_ctg(self):
+ pread_db = fn( self.pread_db )
+ rawread_id_file = fn( self.rawread_id_file )
+ pread_id_file = fn( self.pread_id_file )
+ phased_read_file = fn( self.phased_reads)
+ read_to_contig_map = fn( self.read_to_contig_map )
+ las_file = fn( self.las_file )
+ pread_to_contig_file = fn( self.pread_to_contig_file )
+ read_to_contig_map = fn( self.read_to_contig_map )
+
+ pid_to_rid = open(pread_id_file).read().split("\n")
+ rid_to_oid = open(rawread_id_file).read().split("\n")
+
+
+ ovlp_data = []
+ ovlp_count = 0
+ longest_ovlp = 0
+ a_id = None
+ pid_to_contigs = {}
+
+ with open(read_to_contig_map) as f:
+ for row in f:
+ row = row.strip().split()
+ pid, rid, oid, ctg = row
+ pid = int(pid)
+ pid_to_contigs.setdefault( pid, (oid, set() ) )
+ pid_to_contigs[ pid ][1].add( ctg )
+
+ oid_to_phase = {}
+ with open(phased_read_file) as f:
+ for row in f:
+ row = row.strip().split()
+ ctg_id, block, phase = row[1:4]
+ oid = row[6]
+ block = int(block)
+ phase = int(phase)
+ oid_to_phase[ oid ] = (ctg_id, block, phase)
+
+ with open(pread_to_contig_file, "w") as f:
+ ovlp_data = {}
+ cur_read_id = None
+ skip_rest = 0
+ for row in sp.check_output(shlex.split("LA4Falcon -mo %s %s " % (pread_db, las_file)) ).splitlines():
+
+ row = row.strip().split()
+ t_id = int(row[1])
+ q_id = int(row[0])
+ if q_id != cur_read_id:
+ if cur_read_id == None:
+ cur_read_id = q_id
+ else:
+ if len(ovlp_data) == 0:
+ rid = pid_to_rid[cur_read_id].split("/")[1]
+ rid = int(int(rid)/10)
+ o_id = rid_to_oid[ rid ]
+ print >>f, "%09d %s %s %d %d %d %d" % (cur_read_id, o_id, "NA", 0, 0, 0, 0)
+ else:
+ ovlp_v = ovlp_data.values()
+ ovlp_v.sort()
+ rank = 0
+ for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
+ print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
+ rank += 1
+ ovlp_data = {}
+ cur_read_id = q_id
+ skip_rest = 0
+
+ if q_id in pid_to_contigs and len(ovlp_data) == 0: #if the query is in some contig....
+ t_o_id, ctgs = pid_to_contigs[ q_id ]
+ rid = pid_to_rid[q_id].split("/")[1]
+ rid = int(int(rid)/10)
+ o_id = rid_to_oid[ rid ]
+ for ctg in list(ctgs):
+ ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
+ ovlp_data[ctg][0] = -int(row[7])
+ ovlp_data[ctg][1] += 1
+ skip_rest = 1
+
+ if skip_rest == 1:
+ continue
+
+ if t_id not in pid_to_contigs:
+ continue
+
+ q_rid = int( int(pid_to_rid[q_id].split("/")[1])/10 )
+ q_phase = oid_to_phase.get( rid_to_oid[ q_rid ], None )
+
+ if q_phase != None:
+ ctg_id, block, phase = q_phase
+ if block != -1:
+ t_rid = int( int(pid_to_rid[t_id].split("/")[1])/10 )
+ t_phase = oid_to_phase.get( rid_to_oid[ t_rid ], None )
+ if t_phase != None:
+ if t_phase[0] == ctg_id and t_phase[1] == block and t_phase[2] != phase:
+ continue
+
+ t_o_id, ctgs = pid_to_contigs[ t_id ]
+ rid = pid_to_rid[q_id].split("/")[1]
+ rid = int(int(rid)/10)
+ o_id = rid_to_oid[ rid ]
+
+ for ctg in list(ctgs):
+ ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
+ ovlp_data[ctg][0] += int(row[2])
+ ovlp_data[ctg][1] += 1
+
+ if len(ovlp_data) != 0:
+ ovlp_v = ovlp_data.values()
+ ovlp_v.sort()
+ rank = 0
+ for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
+ print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
+ rank += 1
+
+
+phased_reads = makePypeLocalFile(os.path.join(asm_dir, "all_phased_reads"))
+
+
+for las_key, las_file in all_raw_las_files.items():
+ las_fn = fn(las_file)
+ idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number
+ idx = int(idx.split(".")[1])
+ rawread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "rawread_to_contigs.%s" % idx))
+ make_dump_rawread_to_ctg = PypeTask( inputs = { "las_file": las_file,
+ "rawread_db": rawread_db,
+ "read_to_contig_map": read_to_contig_map,
+ "rawread_id_file": rawread_id_file,
+ "pread_id_file": pread_id_file,
+ "phased_reads" : phased_reads},
+ outputs = { "rawread_to_contig_file": rawread_to_contig_file },
+ TaskType = PypeThreadTaskBase,
+ URL = "task://localhost/r_read_to_contigs.%s" % idx )
+ dump_rawread_to_ctg_task = make_dump_rawread_to_ctg(dump_rawread_to_ctg)
+ wf.addTask( dump_rawread_to_ctg_task )
+
+for las_key, las_file in all_pread_las_files.items():
+ las_fn = fn(las_file)
+ idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number
+ idx = int(idx.split(".")[1])
+ pread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "pread_to_contigs.%s" % idx))
+ make_dump_pread_to_ctg = PypeTask( inputs = { "las_file": las_file,
+ "pread_db": pread_db,
+ "read_to_contig_map": read_to_contig_map,
+ "rawread_id_file": rawread_id_file,
+ "pread_id_file": pread_id_file,
+ "phased_reads": phased_reads},
+ outputs = { "pread_to_contig_file": pread_to_contig_file },
+ TaskType = PypeThreadTaskBase,
+ URL = "task://localhost/pread_to_contigs.%s" % idx )
+ dump_pread_to_ctg_task = make_dump_pread_to_ctg(dump_pread_to_ctg)
+ wf.addTask( dump_pread_to_ctg_task )
+
+wf.refreshTargets() # block
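The id arithmetic repeated above (pid_to_rid[pid].split("/")[1] followed by an integer division by 10) recovers the raw-read index from a pread identifier. A worked sketch of that convention, where the sample name is a made-up illustration of the '<prefix>/<id>/<range>' form:

    # pid -> raw-read index, as done repeatedly above; the name is fabricated
    pread_name = 'pread/0001230/0_15000'
    rid = pread_name.split('/')[1]     # '0001230'
    rid = int(int(rid) / 10)           # 123, the raw-read index
    print rid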
diff --git a/FALCON_unzip/src/py_scripts/fc_unzip.py b/FALCON_unzip/src/py_scripts/fc_unzip.py
new file mode 100644
index 0000000..0e6bb71
--- /dev/null
+++ b/FALCON_unzip/src/py_scripts/fc_unzip.py
@@ -0,0 +1,4 @@
+from falcon_unzip.unzip import main
+import sys
+if __name__ == "__main__":
+ main(sys.argv)
diff --git a/FALCON_unzip/src/py_utils/align.sh b/FALCON_unzip/src/py_utils/align.sh
new file mode 100644
index 0000000..2bcb508
--- /dev/null
+++ b/FALCON_unzip/src/py_utils/align.sh
@@ -0,0 +1,10 @@
+~/bin/blasr_yli_0317 000000F_reads.fa 000000F_ref.fa -noSplitSubreads -clipping subread -placeRepeatsRandomly -minPctIdentity 70.0 -minMatch 12 -nproc 48 -bam -out aln.bam
+#~/bin/blasr_yli_0317 ../data/chm1_bam/all_merged.bam pa_ctg.fa -noSplitSubreads -clipping subread -placeRepeatsRandomly -minPctIdentity 70.0 -minMatch 12 -nproc 48 -bam -out test_chm1.bam
+#samtools sort -m 120G test_chm13.bam chm13_aln
+#samtools sort -m 120G test_chm1.bam chm1_aln
+#python screen_aln_chm1.py
+#python screen_aln_chm13.py
+#bamtools merge -in chm1_aln_sampled.bam -in chm13_aln_sampled.bam -out chm1_chm13_sampled_merged.bam
+#samtools sort -m 120G chm1_chm13_sampled_merged.bam chm1_chm13_sampled_merged_sorted
+#samtools index chm1_chm13_sampled_merged_sorted.bam
+#variantCaller.py --skipUnrecognizedContigs -v -j40 -r pa_ctg.fa --algorithm=quiver chm1_chm13_sampled_merged_sorted.bam -o cns.fastq.gz -o cns.fasta.gz
diff --git a/FALCON_unzip/src/py_utils/call_SV_from_mum.py b/FALCON_unzip/src/py_utils/call_SV_from_mum.py
new file mode 100644
index 0000000..ac8e441
--- /dev/null
+++ b/FALCON_unzip/src/py_utils/call_SV_from_mum.py
@@ -0,0 +1,85 @@
+import sys
+
+mum_fn = sys.argv[1]
+
+def process_mum_data( pread_id, mum_data ):
+ delta = 0
+ pend = 0
+ qend = 0
+ p_delta = 0
+
+
+ rtn = []
+ for rpos, qpos, len_ in mum_data:
+ rpos = int(rpos)
+ qpos = int(qpos)
+ len_ = int(len_)
+ offset = rpos - qpos
+
+ if pend == 0:
+ rtn.append( ( rpos, qpos, len_, offset, rpos+len_, 0, 0, 0, pread_id) )
+ else:
+ rtn.append( ( rpos, qpos, len_, offset, rpos+len_, offset-p_offset, rpos-pend, qpos-qend, pread_id) )
+ pend = rpos+len_
+ qend = qpos+len_
+ p_offset = offset
+
+    # break the matches into clusters
+ cluster = []
+ p_delta = 0
+ for d in rtn:
+ rpos, qpos, len_, offset, r_end, delta, r_delta, q_delta, pread_id = d
+ if q_delta < -10:
+ continue
+ if r_delta < -10:
+ continue
+ if len(cluster) == 0: #start a new cluster
+ cluster.append([])
+ cluster[-1].append( d )
+ #p_delta = delta
+ continue
+ if delta > 30000:
+ #print delta, p_delta
+ #p_delta = 0
+ cluster.append([])
+ continue
+ cluster[-1].append( d )
+ #p_delta = delta
+ rtn = []
+ for cls_id in xrange(len(cluster)):
+ aln_size = 0
+ qpos_ary = []
+ if len(cluster[cls_id]) == 0:
+ continue
+ for d in cluster[cls_id]:
+ rpos, qpos, len_, offset, r_end, delta, r_delta, q_delta, pread_id = d
+ aln_size += len_
+ qpos_ary.append(qpos)
+ if 1.0*(max(qpos_ary) - min(qpos_ary))/aln_size < 0.5 or aln_size < 2000:
+ continue
+
+ for d in cluster[cls_id]:
+ rpos, qpos, len_, offset, r_end, delta, r_delta, q_delta, pread_id = d
+ rtn.append( (rpos, qpos, len_, offset, r_end, delta, r_delta, q_delta, pread_id) )
+
+
+ return rtn
+
+
+
+mum_data = []
+with open(mum_fn) as mum_file:
+ for row in mum_file:
+ row = row.strip().split()
+ if row[0] == ">":
+ if len(mum_data) > 0:
+ rtn = process_mum_data( pread_id, mum_data )
+ if len(rtn) > 0:
+ print ">",pread_id
+ for d in rtn:
+ print " ".join([str(c) for c in d])
+ pread_id = row[1]
+ mum_data = []
+ #print
+        else:
+            mum_data.append( row )
+
+# process the final pread's matches once the input is exhausted
+if len(mum_data) > 0:
+    rtn = process_mum_data( pread_id, mum_data )
+    if len(rtn) > 0:
+        print ">",pread_id
+        for d in rtn:
+            print " ".join([str(c) for c in d])
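The parser above expects MUM-style input in which each query begins with a '> <pread_id>' header line followed by 'ref_pos query_pos match_len' rows; a tiny sketch of parsing such a fragment, with invented coordinates:

    # fabricated fragment in the format the script above consumes
    example = '''\
    > pread_000001
    1000 10 5000
    6200 5030 2500
    '''
    for line in example.splitlines():
        row = line.strip().split()
        if not row:
            continue
        if row[0] == '>':
            print 'query:', row[1]
        else:
            rpos, qpos, len_ = [int(c) for c in row]
            print rpos, qpos, len_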
diff --git a/FALCON_unzip/src/py_utils/fetch_read.py b/FALCON_unzip/src/py_utils/fetch_read.py
new file mode 100644
index 0000000..74def7e
--- /dev/null
+++ b/FALCON_unzip/src/py_utils/fetch_read.py
@@ -0,0 +1,12 @@
+from falcon_kit.FastaReader import FastaReader
+
+import sys
+
+f = FastaReader(sys.argv[1])
+rl = set(open(sys.argv[2]).read().split())
+for r in f:
+ rid = r.name.split()[0]
+ if rid not in rl:
+ continue
+ print ">"+rid
+ print r.sequence
diff --git a/FALCON_unzip/src/py_utils/split_bam.py b/FALCON_unzip/src/py_utils/split_bam.py
new file mode 100644
index 0000000..f5562f0
--- /dev/null
+++ b/FALCON_unzip/src/py_utils/split_bam.py
@@ -0,0 +1,28 @@
+import subprocess, shlex
+
+
+read_to_phase = {}
+with open("phased_reads") as f:
+ for l in f:
+ l = l.strip().split()
+ rid = l[-1]
+ phase = l[1] + "_" +l[2]
+ read_to_phase[rid] = phase
+
+p = subprocess.Popen(shlex.split("samtools view -h 000207F.000207F000207F.bam" ), stdout=subprocess.PIPE)
+
+phase_to_file = {}
+header = []
+for l in p.stdout:
+ l2 = l.strip().split()
+ if l2[0][0] == "@":
+ header.append(l.strip())
+ QNAME = l2[0]
+ phase = read_to_phase.get(QNAME, None)
+ if phase != None:
+ pfn = ("./phased_sam/%s" % phase) +".sam"
+ if pfn not in phase_to_file:
+ phase_to_file[pfn] = open(pfn,"w")
+ print >> phase_to_file[pfn], "\n".join(header)
+ print >> phase_to_file[pfn], l.strip()
+
diff --git a/pypeFLOW/pwatcher/fs_based.py b/pypeFLOW/pwatcher/fs_based.py
index 0ace6c7..e22639f 100755
--- a/pypeFLOW/pwatcher/fs_based.py
+++ b/pypeFLOW/pwatcher/fs_based.py
@@ -404,7 +404,7 @@ class MetaJobSlurm(object):
sge_option = qstripped(self.mjob.job.options['sge_option'])
job_queue = self.mjob.job.options['job_queue']
cwd = os.getcwd()
- sge_cmd = 'sbatch -J {job_name} -q {job_queue} {sge_option} -D {cwd} -o stdout -e stderr --wrap="{exe} {script_fn}"'.format(
+ sge_cmd = 'sbatch -J {job_name} -p {job_queue} {sge_option} -D {cwd} -o stdout -e stderr --wrap="{exe} {script_fn}"'.format(
**locals())
# "By default all environment variables are propagated."
# http://slurm.schedmd.com/sbatch.html
diff --git a/pypeFLOW/pwatcher/network_based.py b/pypeFLOW/pwatcher/network_based.py
index 8866b54..126c773 100755
--- a/pypeFLOW/pwatcher/network_based.py
+++ b/pypeFLOW/pwatcher/network_based.py
@@ -625,7 +625,7 @@ class MetaJobSlurm(object):
sge_option = qstripped(self.mjob.job.options['sge_option'])
job_queue = self.mjob.job.options['job_queue']
cwd = os.getcwd()
- sge_cmd = 'sbatch -J {job_name} -q {job_queue} {sge_option} -D {cwd} -o stdout -e stderr --wrap="{exe} {script_fn}"'.format(
+ sge_cmd = 'sbatch -J {job_name} -p {job_queue} {sge_option} -D {cwd} -o stdout -e stderr --wrap="{exe} {script_fn}"'.format(
**locals())
# "By default all environment variables are propagated."
# http://slurm.schedmd.com/sbatch.html
diff --git a/pypeFLOW/pypeflow/simple_pwatcher_bridge.py b/pypeFLOW/pypeflow/simple_pwatcher_bridge.py
index 3b2fe59..5c068be 100644
--- a/pypeFLOW/pypeflow/simple_pwatcher_bridge.py
+++ b/pypeFLOW/pypeflow/simple_pwatcher_bridge.py
@@ -13,6 +13,7 @@ import pprint
import random
import sys
import time
+import traceback
LOG = logging.getLogger(__name__)
@@ -463,13 +464,13 @@ def only_path(p):
return p.path
else:
return p
-def PypeTask(inputs, outputs, TaskType=None, parameters=None, URL=None, wdir=None, name=None):
+def PypeTask(inputs, outputs, parameters=None, wdir=None, name=None):
"""A slightly messy factory because we want to support both strings and PypeLocalFiles, for now.
This can alter dict values in inputs/outputs if they were not already PypeLocalFiles.
"""
if wdir is None:
wdir = find_work_dir([only_path(v) for v in outputs.values()])
- this = _PypeTask(inputs, outputs, parameters, URL, wdir, name)
+ this = _PypeTask(inputs, outputs, parameters, wdir, name)
#basedir = os.path.basename(wdir)
basedir = this.name
if basedir in PRODUCERS:
@@ -507,15 +508,14 @@ class _PypeTask(object):
return self
def __repr__(self):
return 'PypeTask({!r}, {!r}, {!r}, {!r})'.format(self.name, self.wdir, pprint.pformat(self.outputs), pprint.pformat(self.inputs))
- def __init__(self, inputs, outputs, parameters=None, URL=None, wdir=None, name=None):
+ def __init__(self, inputs, outputs, parameters=None, wdir=None, name=None):
if parameters is None:
parameters = {}
if wdir is None:
wdir = parameters.get('wdir', name) # One of these must be a string!
if name is None:
name = os.path.relpath(wdir)
- if URL is None:
- URL = 'task://localhost/{}'.format(name)
+ URL = 'task://localhost/{}'.format(name)
self.inputs = inputs
self.outputs = outputs
self.parameters = parameters
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/falcon.git